Code
library(haven)
library(poLCA)
library(dplyr)
library(ggplot2)
library(tidyr)
library(skimr)
library(kableExtra)
library(MASS)
library(wesanderson)
library(ggrepel)
library(here)
library(emmeans)
library(Hmisc)
library(sjstats)
library(readr)NOTE: In plots, where there is “n=”, this figure refers to the total number of respondents in the row/column. This presentation is somewhat misleading and will be changed in future iterations.
library(haven)
library(poLCA)
library(dplyr)
library(ggplot2)
library(tidyr)
library(skimr)
library(kableExtra)
library(MASS)
library(wesanderson)
library(ggrepel)
library(here)
library(emmeans)
library(Hmisc)
library(sjstats)
library(readr)rm(list = ls())
colours <- wes_palette("GrandBudapest2",4,"discrete")
better_colours <- c('#8dd3c7','#bebada','#fb8072','#80b1d3','#fdb462')
many_colours <- c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928','#8dd3c7','#ffffb3','#bebada','#fb8072','#80b1d3','#fdb462','#b3de69','#fccde5','#d9d9d9','#bc80bd','#ccebc5','#ffed6f')#data <- haven::read_sav("../Data/2024-04-25 - Cleaned_Data.sav")
data <- readRDS("../Data/2024-05-13 - Cleaned_Data.rds") url <- "https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/datasets/nationallabourmarketsummarybyregions01/current/lmregsummaryfebruary2024.xls"
filename <- basename(url)
filepath <- here("./data/", filename)
if(!file.exists(filepath)){
cat("Downloading data\n")
download.file(url, destfile = filepath, mode = "wb")
} else{
cat("Data already in directory. Loading it.\n")
}Data already in directory. Loading it.
employed <- readxl::read_xls(filepath, sheet = "S01.1", range = "F13:F24", col_names = F)
area_names <- readxl::read_xls(filepath, sheet = "S01.1", range = "B13:B24",col_names = F)
rgn_empl_denoms <- data.frame(area_names, employed) %>%
mutate(across(where(is.numeric), ~.*1000)) # *1000 to get real number
colnames(rgn_empl_denoms) <- c("Region","Employed")
rgn_empl_denoms <- rgn_empl_denoms %>%
mutate(
Weight = Employed/sum(Employed)
)total_outsourced <- data %>%
group_by(outsourcing_status) %>%
summarise(
Sum = sum(NatRepemployees)
) %>%
mutate(
Proportion = Sum / sum(Sum),
Percentage = 100 * Proportion
)
readr::write_csv(total_outsourced, file="../outputs/data/total_outsourced.csv")
# Create function to find nearest denominator to express as a fraction.
f <- function(x) ifelse(abs(1/floor(1/x) - x) < abs(1/ceiling(1/x) - x),floor(1/x),ceiling(1/x))Based on this definition, we’ve found that just under 17% of UK workers are ‘outsourced’1. Who makes up this group of 17% of UK workers?
total_outsourced <- data %>%
group_by(outsourcing_group) %>%
summarise(
Sum = sum(NatRepemployees)
) %>%
mutate(
Proportion = Sum / sum(Sum),
Percentage = 100 * Proportion
)
readr::write_csv(total_outsourced, file="../outputs/data/total_outsourced.csv")In terms of the the different possible types of outsourced groups, the numbers are as follows:
# filter to just cases where income is abovve the fifth percentile and lower than the 95th? I.e., drop the top and bottom 5%.
income_statistics <- data %>%
filter(income_drop_all == 0 & !is.na(income_annual_all)) %>%
group_by(outsourcing_status) %>%
summarise(
mean = weighted.mean(income_annual_all, w = NatRepemployees, na.rm = T),
median = wtd.quantile(income_annual_all, w = NatRepemployees, probs = c(.5), na.rm = T),
min = wtd.quantile(income_annual_all, w = NatRepemployees, probs = c(0), na.rm = T),
max = wtd.quantile(income_annual_all, w = NatRepemployees, probs = c(1), na.rm = T),
stdev = sqrt(wtd.var(income_annual_all, w = NatRepemployees, na.rm = T))
)
knitr::kable(income_statistics,
digits = 2,
col.names = c("Outsourcing group",
"Mean",
"Median",
"Min",
"Max",
"Standard dev.")) %>%
kable_styling(full_width = F)| Outsourcing group | Mean | Median | Min | Max | Standard dev. |
|---|---|---|---|---|---|
| Not outsourced | 27174.96 | 25200.5 | 2000 | 68000 | 13400.97 |
| Outsourced | 25104.04 | 24000.0 | 2400 | 68000 | 13090.35 |
readr::write_csv(income_statistics, file="../outputs/data/income_stats.csv")# plot the distribution of income for the two groups
data %>%
filter(income_drop_all == 0 & !is.na(income_annual_all)) %>%
ggplot(., aes(outsourcing_status, income_annual_all)) +
geom_violin() +
geom_boxplot(width = 0.3) +
geom_text(inherit.aes=F, data=income_statistics, aes(outsourcing_status, y = 6e+04), label=paste0("Mean = ", round(income_statistics$mean,0),"\n", "Median = ", income_statistics$median), nudge_x = 0.1, hjust=0) +
coord_cartesian(xlim=c(1,2.5)) +
theme_minimal() +
xlab("Outsourcing status") + ylab("Annual income") +
coord_cartesian(ylim = c(plyr::round_any(min(income_statistics$min), 5000, f = floor),plyr::round_any(max(income_statistics$max),5000, f = ceiling))) +
scale_y_continuous(breaks = seq(plyr::round_any(min(income_statistics$min), 5000, f = ceiling), plyr::round_any(max(income_statistics$max),5000, f = ceiling), 10000))The distribution for the different outsourcing groups is shown below. It indicates that income is particularly low for the ‘outsourced’ and ‘likely agency’ workers, whilst average income for the ‘high indicators’ workers is notably higher. This means that, were we not to consider the high indicators group, the difference in income between outsrouced and non-outsourced workers would be larger.
income_statistics <- data %>%
filter(income_drop_all == 0 & !is.na(income_annual_all)) %>%
group_by(outsourcing_group) %>%
summarise(
n = n(),
mean = weighted.mean(income_annual_all, w = NatRepemployees, na.rm = T),
median = wtd.quantile(income_annual_all, w = NatRepemployees, probs = c(.5), na.rm = T),
min = wtd.quantile(income_annual_all, w = NatRepemployees, probs = c(0), na.rm = T),
max = wtd.quantile(income_annual_all, w = NatRepemployees, probs = c(1), na.rm = T),
stdev = sqrt(wtd.var(income_annual_all, w = NatRepemployees, na.rm = T))
)
data %>%
filter(income_drop_all == 0 & !is.na(income_annual_all)) %>%
ggplot(., aes(outsourcing_group, income_annual_all)) +
geom_violin() +
geom_boxplot(width = 0.3) +
geom_text(inherit.aes=F, data=income_statistics, aes(outsourcing_group, y = 6e+04), label=paste0("Mean = ", round(income_statistics$mean,0),"\n", "Median = ", round(income_statistics$median,0),"\n N = ", income_statistics$n), nudge_x = 0.1, hjust=0) +
coord_cartesian(xlim=c(1,2.5)) +
theme_minimal() +
xlab("Outsourcing group") + ylab("Annual income") +
coord_cartesian(ylim = c(plyr::round_any(min(income_statistics$min), 5000, f = floor),plyr::round_any(max(income_statistics$max),5000, f = ceiling))) +
scale_y_continuous(breaks = seq(plyr::round_any(min(income_statistics$min), 5000, f = ceiling), plyr::round_any(max(income_statistics$max),5000, f = ceiling), 10000))temp_data <- data %>%
filter(income_drop == 0 & !is.na(income_annual))
# ttest <- t.test(temp_data[which(temp_data$outsourcing_status=="Outsourced"),"income_annual"],
# temp_data[which(temp_data$outsourcing_status=="Not outsourced"),"income_annual"]
# )
ttest <- sjstats::weighted_ttest(income_annual ~ outsourcing_status + NatRepemployees, temp_data)
#cohens_d(ttest)Although the average pay between non-outsourced and outsourced workers looks similar, a t-test finds that there is a marginally significant difference; outsourced workers are on average paid less than non-outsourced workers (t(1511.07) = 3.97, p = 0).
Below, we run a linear regression testing whether the relationship between outsourcing status and annual income is influenced by income group (not low vs low), controlling for age, gender, ethnicity, and region. We do indeed find a significant interaction effect. The figure below plots this.
test <- lm(income_annual ~ Age + Gender + Ethnicity + Region + outsourcing_status*income_group, data, weights = NatRepemployees)
# summary(test)
emmeans(test, specs = "outsourcing_status", by = "income_group")income_group = Not low:
outsourcing_status emmean SE df lower.CL upper.CL
Not outsourced 62912 13148 7360 37139 88686
Outsourced 90189 13564 7360 63598 116779
income_group = Low:
outsourcing_status emmean SE df lower.CL upper.CL
Not outsourced 23036 13200 7360 -2840 48912
Outsourced 21766 13846 7360 -5375 48908
Results are averaged over the levels of: Gender, Region
Confidence level used: 0.95
sjPlot::plot_model(test, type = "int")The results here indicate that among workers that are not paid below our low pay threshold, an outsourced worker can typically be expected to earn considerably more than a non-outsourced worker, whereas among workers that are paid below our low pay threshold, an outsourced worker can typically be expected to be paid the same as a non-outsourced worker (maybe slightly less). Of note is that for both pay groups, the variance in pay is higher for the outsourced groups.
Comparison of Majorgroupcode indicates that a higher proportion of outsourced people work in Elementary Occupations, compared to non-outsourced people. A lower proportion of outsourced people work in administrative and secretarial occupations, associate professional occupations and professional occupations.
mgc_summary <- data %>%
group_by(outsourcing_status,Majorgroupcode) %>%
summarise(
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
mutate(
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum)
)
readr::write_csv(mgc_summary, "../outputs/data/majorgroupcode_summary.csv")mgc_summary %>%
ggplot(aes(outsourcing_status, perc, fill = as.factor(Majorgroupcode))) +
geom_col() +
coord_flip() +
scale_fill_manual(values=many_colours)mgc_key <- data.frame("number" = seq(1,11,1),
"Major group code" = c( levels(haven::as_factor(mgc_summary$Majorgroupcode)),NA))
mgc_key %>%
kable() %>%
kable_styling(full_width = F)| number | Major.group.code |
|---|---|
| 1 | ADMINISTRATIVE AND SECRETARIAL OCCUPATIONS |
| 2 | ASSOCIATE PROFESSIONAL OCCUPATIONS |
| 3 | CARING, LEISURE AND OTHER SERVICE OCCUPATIONS |
| 4 | ELEMENTARY OCCUPATIONS |
| 5 | MANAGERS, DIRECTORS AND SENIOR OFFICIALS |
| 6 | NA |
| 7 | PROCESS, PLANT AND MACHINE OPERATIVES |
| 8 | PROFESSIONAL OCCUPATIONS |
| 9 | SALES AND CUSTOMER SERVICE OCCUPATIONS |
| 10 | SKILLED TRADES OCCUPATIONS |
| 11 | NA |
The table below shows the percentage of outsourced and non-outsourced workers in each majorgroupcode, as well as the difference between them (positive numbers in the difference column indicate occupations that are more common for outsourced work, negative numbers indicate occupations that are less common for outsourced work).3
mgc_summary %>%
select(outsourcing_status, Majorgroupcode, perc) %>%
pivot_wider(names_from = outsourcing_status, values_from = perc) %>%
mutate(
Majorgroupcode = haven::as_factor(Majorgroupcode),
Difference = `Outsourced` - `Not outsourced`
) %>%
kable(digits = 2) %>%
kable_styling(full_width = F)| Majorgroupcode | Not outsourced | Outsourced | Difference |
|---|---|---|---|
| ADMINISTRATIVE AND SECRETARIAL OCCUPATIONS | 14.09 | 10.23 | -3.85 |
| ASSOCIATE PROFESSIONAL OCCUPATIONS | 13.23 | 9.35 | -3.88 |
| CARING, LEISURE AND OTHER SERVICE OCCUPATIONS | 8.85 | 10.00 | 1.14 |
| ELEMENTARY OCCUPATIONS | 8.84 | 17.12 | 8.28 |
| MANAGERS, DIRECTORS AND SENIOR OFFICIALS | 11.07 | 12.23 | 1.16 |
| NA | 0.41 | 0.29 | -0.12 |
| PROCESS, PLANT AND MACHINE OPERATIVES | 6.19 | 6.99 | 0.80 |
| PROFESSIONAL OCCUPATIONS | 19.92 | 17.57 | -2.35 |
| SALES AND CUSTOMER SERVICE OCCUPATIONS | 12.06 | 10.97 | -1.10 |
| SKILLED TRADES OCCUPATIONS | 5.32 | 5.24 | -0.07 |
| NA | 0.01 | NA | NA |
mgc_summary_2 <- data %>%
group_by(Majorgroupcode,outsourcing_status) %>%
summarise(
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
mutate(
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum)
) %>%
drop_na()
mgc_summary_2_wider <- mgc_summary_2 %>%
select(outsourcing_status, Majorgroupcode, perc) %>%
pivot_wider(names_from = outsourcing_status, values_from = perc) %>%
mutate(
Majorgroupcode = haven::as_factor(Majorgroupcode),
#Difference = `Outsourced` - `Not outsourced`,
Ratio_10 = 10 * (`Outsourced` / `Not outsourced`) # indicates how many outsourced workers there are for every 10 non outsourced
)
mgc_summary_2_wider %>%
kable(digits = 2) %>%
kable_styling(full_width = F)| Majorgroupcode | Not outsourced | Outsourced | Ratio_10 |
|---|---|---|---|
| ADMINISTRATIVE AND SECRETARIAL OCCUPATIONS | 87.19 | 12.81 | 1.47 |
| ASSOCIATE PROFESSIONAL OCCUPATIONS | 87.49 | 12.51 | 1.43 |
| CARING, LEISURE AND OTHER SERVICE OCCUPATIONS | 81.41 | 18.59 | 2.28 |
| ELEMENTARY OCCUPATIONS | 71.86 | 28.14 | 3.92 |
| MANAGERS, DIRECTORS AND SENIOR OFFICIALS | 81.73 | 18.27 | 2.24 |
| NA | 87.59 | 12.41 | 1.42 |
| PROCESS, PLANT AND MACHINE OPERATIVES | 81.41 | 18.59 | 2.28 |
| PROFESSIONAL OCCUPATIONS | 84.86 | 15.14 | 1.78 |
| SALES AND CUSTOMER SERVICE OCCUPATIONS | 84.47 | 15.53 | 1.84 |
| SKILLED TRADES OCCUPATIONS | 83.37 | 16.63 | 1.99 |
readr::write_csv(mgc_summary_2, "../outputs/data/majorgroupcode_summary_2_long.csv")
readr::write_csv(mgc_summary_2_wider, "../outputs/data/majorgroupcode_summary_2_wide.csv")The plot below summarises the average pay (x-axis) in each occupation (y-axis) for outsourced and non-outsourced workers (dot colour), as well as the size of the respective workforce (size of dots). Here the size of the dot represents the percentage of workers within the sector who are outsourced (blue) or not outsourced (purple).4
mgc_summary_2 %>%
mutate(
Majorgroupcode = haven::as_factor(Majorgroupcode)
) %>%
ggplot(., aes(Majorgroupcode, wtd_avg_income, size = perc, colour = outsourcing_status)) +
geom_point(position = "dodge") +
coord_flip() +
theme_minimal() +
theme(legend.position = "bottom",
legend.title = element_blank())+
scale_y_continuous(breaks=seq(0,max(mgc_summary$wtd_avg_income), 25000)) +
scale_colour_manual(values=colours) +
guides(size=FALSE) # remove size legend as gauging size is difficultIt shows, as might be expected, the size of the outsourced workforce for each sector is smaller than the non-outsourced workforce, but the ratio is not the same for all sectors. The sector with the largest non-outsourced:outsourced ratio is Elementary occupations; for every 10 non-outsourced workers, there are 4 outsourced workers. This is followed by caring, leisure, and other service occupations, and process, plant and machine operatives, both of which employ 2 outsourced workers for every 10 non-outsourced workers.
Notably, in elementary occupations and sales and customer service occupations, outsourced workers are on average paid more than non-outsourced workers. In contrast, workers in process, plant and machine operations are paid less if they are outsourced than if they are not outsourced.
A deep dive into elementary occupations, process occupations, and caring occupations reveals that there are differences between occupations in the size of the outsourced workforce and pay.
elem_summary <- data %>%
filter(Majorgroupcode %in% c(4)) %>%
group_by(UnitOccupation,outsourcing_status) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
mutate(
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum)
) %>%
drop_na()
elem_summary %>%
mutate(
UnitOccupation = haven::as_factor(UnitOccupation)
) %>%
ggplot(., aes(wtd_avg_income,UnitOccupation, size = perc, colour = outsourcing_status)) +
geom_point() +
geom_text_repel(inherit.aes = F, aes(wtd_avg_income, UnitOccupation, colour = outsourcing_status, label=paste0("n=",n))) +
#coord_flip() +
theme_minimal() +
theme(legend.position = "bottom",
legend.title = element_blank()) +
# scale_x_continuous(breaks=scales::pretty(10))
scale_colour_manual(values=colours) +
guides(size=FALSE) readr::write_csv(elem_summary, "../outputs/data/elementary_occs_summary.csv")proc_summary <- data %>%
filter(Majorgroupcode %in% c(7)) %>%
group_by(UnitOccupation,outsourcing_status) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
mutate(
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum)
) %>%
drop_na()
proc_summary %>%
mutate(
UnitOccupation = haven::as_factor(UnitOccupation)
) %>%
ggplot(., aes(wtd_avg_income,UnitOccupation, size = perc, colour = outsourcing_status)) +
geom_point() +
geom_text_repel(inherit.aes = F, aes(wtd_avg_income, UnitOccupation, colour = outsourcing_status, label=paste0("n=",n))) +
#coord_flip() +
theme_minimal() +
theme(legend.position = "bottom",
legend.title = element_blank()) +
# scale_x_continuous(breaks=scales::pretty(10))
scale_colour_manual(values=colours) +
guides(size=FALSE) readr::write_csv(proc_summary, "../outputs/data/process_occs_summary.csv")caring_summary <- data %>%
filter(Majorgroupcode %in% c(3)) %>%
group_by(UnitOccupation,outsourcing_status) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
mutate(
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum)
) %>%
drop_na()
caring_summary %>%
mutate(
UnitOccupation = haven::as_factor(UnitOccupation)
) %>%
ggplot(., aes(wtd_avg_income,UnitOccupation, size = perc, colour = outsourcing_status)) +
geom_point() +
geom_text_repel(inherit.aes = F, aes(wtd_avg_income, UnitOccupation, colour = outsourcing_status, label=paste0("n=",n))) +
#coord_flip() +
theme_minimal() +
theme(legend.position = "bottom",
legend.title = element_blank()) +
# scale_x_continuous(breaks=scales::pretty(10))
scale_colour_manual(values=colours) +
guides(size=FALSE) readr::write_csv(caring_summary, "../outputs/data/caring_occs_summary.csv")This framing shows how outsourced and non-outsourced workers are distributed across sectors.
sector_summary <- data %>%
group_by(outsourcing_status, SectorName, SectorName_labelled) %>%
summarise(
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
ungroup() %>%
group_by(outsourcing_status) %>%
mutate(
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum),
SectorName_labelled = case_when(SectorName_labelled == "NA" ~ NA,
TRUE ~ SectorName_labelled),
SectorName_short = SectorName_labelled
) %>%
# make the sector names more readable
separate_wider_delim(SectorName_short, names = c("SectorName_short", "SectorName_short_detail"), delim=";",
too_few = "align_start") %>%
mutate(
SectorName_short = factor(stringr::str_to_sentence(SectorName_short)),
SectorName_short_detail = factor(stringr::str_to_sentence(SectorName_short_detail)),
)
readr::write_csv(sector_summary, "../outputs/data/sector_summary.csv")The plot below shows the sector breakdown by outsourcing status. I.e. this is how outsourced and not outsourced workers are distributed across sectors.8
With this framing we could say things like “as an outsourced worker, you are x times more likely to work in
plot_data <- sector_summary %>%
drop_na(SectorName_labelled) %>%
droplevels()
plot_data %>%
ggplot(aes(outsourcing_status, perc, fill = as.factor(SectorName))) +
geom_col() +
coord_flip() +
scale_fill_manual(values=many_colours)sector_key <- data.frame("number" = seq(1,length(unique(plot_data$SectorName)),1),
"Sector" = levels(plot_data$SectorName_labelled))
sector_key %>%
kable() %>%
kable_styling(full_width = F)| number | Sector |
|---|---|
| 1 | ACCOMMODATION AND FOOD SERVICE ACTIVITIES |
| 2 | ACTIVITIES OF EXTRATERRITORIAL ORGANISATIONS AND BODIES |
| 3 | ACTIVITIES OF HOUSEHOLDS AS EMPLOYERS; UNDIFFERENTIATED GOODS-AND SERVICES-PRODUCING ACTIVITIES OF HOUSEHOLDS FOR OWN US |
| 4 | ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES |
| 5 | AGRICULTURE, FORESTRY AND FISHING |
| 6 | ARTS, ENTERTAINMENT AND RECREATION |
| 7 | CONSTRUCTION |
| 8 | EDUCATION |
| 9 | ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY |
| 10 | FINANCIAL AND INSURANCE ACTIVITIES |
| 11 | HUMAN HEALTH AND SOCIAL WORK ACTIVITIES |
| 12 | INFORMATION AND COMMUNICATION |
| 13 | MANUFACTURING |
| 14 | MINING AND QUARRYING |
| 15 | Not found |
| 16 | OTHER SERVICE ACTIVITIES |
| 17 | PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES |
| 18 | PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY |
| 19 | REAL ESTATE ACTIVITIES |
| 20 | TRANSPORTATION AND STORAGE |
| 21 | WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES |
| 22 | WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES |
The table below shows what percentage of outsourced and non-outsourced workers work in each sector, as well as the difference between them (positive numbers in the difference column indicate sectors that are more common for outsourced work, negative numbers indicate sectors that are less common for outsourced work).9
It indicates that sectors that are less common for outsourced workers compared to not outsourced are:
And sectors that are more common for outsourced workers compared to not outsourced are:
For these sectors that differ most in the concentration of the outsourced workforce, there is a pattern (if three data points can be called that) whereby in sectors with a relatively higher concentration of outsourced workers, outsourced workers are paid less, and in sectors with a relatively lower concentration of outsourced workers, outsourced workers are paid more. This is tenuous, but it is an example of the heterogeneity in income between sectors, and should be explored further.
sector_summary_2 <- plot_data %>%
select(outsourcing_status, SectorName, SectorName_short, perc, wtd_avg_income) %>%
pivot_wider(names_from = outsourcing_status, values_from = c(perc, wtd_avg_income)) %>%
mutate(
perc_difference = `perc_Outsourced` - `perc_Not outsourced`,
income_difference = `wtd_avg_income_Outsourced` - `wtd_avg_income_Not outsourced`
) %>%
relocate(
perc_difference, .after = perc_Outsourced
)
sector_summary_2 %>%
arrange(desc(abs(perc_difference))) %>%
kable(digits = 2) %>%
kable_styling(full_width = F)| SectorName | SectorName_short | perc_Not outsourced | perc_Outsourced | perc_difference | wtd_avg_income_Not outsourced | wtd_avg_income_Outsourced | income_difference |
|---|---|---|---|---|---|---|---|
| 19 | Public administration and defence | 7.98 | 4.41 | -3.57 | 44269.46 | 80688.11 | 36418.64 |
| 4 | Administrative and support service activities | 2.76 | 6.30 | 3.54 | 34008.23 | 25773.63 | -8234.59 |
| 8 | Education | 11.00 | 8.17 | -2.83 | 42664.00 | 66203.95 | 23539.95 |
| 13 | Manufacturing | 8.39 | 7.28 | -1.10 | 47496.16 | 61992.04 | 14495.88 |
| 11 | Human health and social work activities | 16.74 | 15.81 | -0.93 | 34943.28 | 37315.45 | 2372.18 |
| 21 | Transportation and storage | 4.86 | 5.79 | 0.93 | 50974.51 | 40399.44 | -10575.07 |
| 7 | Construction | 3.22 | 4.13 | 0.91 | 57729.30 | 79257.70 | 21528.41 |
| 17 | Other service activities | 2.77 | 3.66 | 0.89 | 39021.24 | 49283.90 | 10262.65 |
| 22 | Water supply | 0.76 | 1.62 | 0.85 | 46514.17 | 50316.51 | 3802.34 |
| 1 | Accommodation and food service activities | 5.89 | 6.71 | 0.83 | 23814.49 | 46561.01 | 22746.52 |
| 12 | Information and communication | 4.58 | 5.32 | 0.74 | 63987.91 | 97333.04 | 33345.13 |
| 18 | Professional, scientific and technical activities | 3.92 | 4.66 | 0.74 | 53888.29 | 69683.01 | 15794.72 |
| 3 | Activities of households as employers | 0.31 | 0.86 | 0.54 | 57350.92 | 21162.34 | -36188.59 |
| 23 | Wholesale and retail trade | 15.43 | 14.93 | -0.49 | 36426.50 | 59181.02 | 22754.52 |
| 16 | Not found | 0.30 | 0.68 | 0.38 | 30372.90 | 30421.16 | 48.26 |
| 20 | Real estate activities | 1.25 | 0.97 | -0.29 | 39630.78 | 27557.00 | -12073.78 |
| 6 | Arts, entertainment and recreation | 1.81 | 1.61 | -0.20 | 26930.44 | 21607.35 | -5323.09 |
| 5 | Agriculture, forestry and fishing | 0.42 | 0.22 | -0.20 | 36423.88 | 25767.70 | -10656.18 |
| 9 | Electricity, gas, steam and air conditioning supply | 1.21 | 1.32 | 0.10 | 84854.37 | 73130.20 | -11724.17 |
| 10 | Financial and insurance activities | 4.91 | 4.81 | -0.10 | 62097.41 | 143214.64 | 81117.23 |
| 2 | Activities of extraterritorial organisations and bodies | 0.04 | NA | NA | 32367.32 | NA | NA |
| 14 | Mining and quarrying | 0.10 | NA | NA | 54859.57 | NA | NA |
readr::write_csv(sector_summary_2, file="../outputs/data/sector_summary_2.csv")The plot below plots the percentage difference in the concentration of outsourced vs non-outsourced workers (i.e. the difference between what proportion of workers of each type are in each sector) against the income difference for that sector (i.e., the difference in the average income between groups). Note that a statistical test of this relationship shows it is non-significant. This plot therefore only serves as an illustration of where workers are situated in terms of sector and pay. A key takeaway here is that there is considerable variation in the difference in pay between outsourced and non-outsourced workers. There also appears to be a central area where the concentration of outsourced vs non-outsourced workers is quite similar, but the pay for outsourced workers is lower. This might indicate sectors where employment of outsourced workers is as common as employment of non-outsourced workers, but where outsourced workers are paid less than non-outsourced workers. These sectors are:
annotations <- data.frame(
xpos = c(-Inf,-Inf,Inf,Inf), # this sets to corners
ypos = c(-Inf, Inf,-Inf,Inf), # this sets to corners
annotateText = c("Outsourced concentration smaller, pay lower","Outsourced concentration smaller, pay higher",
"Outsourced concentration larger, pay lower","Outsourced concentration larger, pay higher"),
hjustvar = c(0,0,1,1), # higher values = right, lower values = left
vjustvar = c(-0.5,1,-0.5,1) # higher values = up, lower values = down
)
sector_summary_2 %>%
ggplot(aes(perc_difference, income_difference)) +
geom_point() +
geom_smooth(method="lm", colour=many_colours[2], se=FALSE) +
theme_minimal() +
ylab("Income difference in £ (out. - non-out.)") + xlab("Workforce proportion difference (out. - non-out)") +
geom_text_repel(aes(label=as.character(SectorName))) +
geom_text(inherit.aes = F, data = annotations, aes(xpos, ypos, label=annotateText, hjust=hjustvar, vjust=vjustvar)) +
geom_hline(yintercept = 0, colour="red",linetype="dashed")test <- lm(income_difference ~ perc_difference, sector_summary_2)
summary(test)
Call:
lm(formula = income_difference ~ perc_difference, data = sector_summary_2)
Residuals:
Min 1Q Median 3Q Max
-43309 -15858 -750 9840 70740
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9860 5442 1.812 0.0867 .
perc_difference -5043 3792 -1.330 0.2001
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 24330 on 18 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.08948, Adjusted R-squared: 0.0389
F-statistic: 1.769 on 1 and 18 DF, p-value: 0.2001
This framing shows how sectors are composed, i.e., what proportion of workers in each sector are outsourced vs non-outsourced.
sector_summary_3 <- data %>%
group_by(SectorName, SectorName_labelled, outsourcing_status) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
ungroup() %>%
group_by(SectorName) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum),
SectorName_labelled = case_when(SectorName_labelled == "NA" ~ NA,
TRUE ~ SectorName_labelled),
SectorName_short = SectorName_labelled
) %>%
# make the sector names more readable
separate_wider_delim(SectorName_short, names = c("SectorName_short", "SectorName_short_detail"), delim=";",
too_few = "align_start") %>%
mutate(
SectorName_short = factor(stringr::str_to_sentence(SectorName_short)),
SectorName_short_detail = factor(stringr::str_to_sentence(SectorName_short_detail)),
)
write_csv(sector_summary_3, file="../outputs/data/sector_summary_3.csv")The plot below shows the proportion of outsourced and not outsourced workers within each sector. I.e. this is showing what sectors have higher and lower proportions of outsourced workers.
With this framing we could say things like “sector a is x times more likely to employ outsourced workers than sector b”
plot_data <- sector_summary_3 %>%
drop_na(SectorName_short) %>%
droplevels()
# annotation_df <- plot_data %>%
# select(SectorName_short, outsourcing_status, perc, n
# mutate(
annotation_df <- plot_data %>%
filter(outsourcing_status == "Not outsourced") %>%
select(SectorName_short, N) %>%
mutate(
ypos = 80
)
ggplot(plot_data,aes(SectorName_short, perc, fill = outsourcing_status)) +
geom_col() +
geom_text(inherit.aes=F,data=annotation_df, aes(x=SectorName_short, y=ypos, label = paste0("N = ", N)), hjust=1, nudge_y = 15) +
coord_flip() +
scale_fill_manual(values=many_colours) +
scale_y_continuous(breaks=seq(0,100,10))# sector_key <- data.frame("number" = seq(1,length(unique(plot_data$SectorName_labelled)),1),
# "Sector" = levels(plot_data$SectorName_labelled))
#
# sector_key %>%
# kable() %>%
# kable_styling(full_width = F)The table below shows the percentage of outsourced workers in each Sector, ordered descending by percentage. It shows that the top three Sectors with the highest proportion of outsourced workers are:
Note that for an undefined sector (‘Not found’) contained one of the largest proportions of outsourced workers (31% of workers in the ‘Not found’ category were outsourced).
plot_data %>%
filter(outsourcing_status == "Outsourced") %>%
arrange(desc(perc)) %>%
select(SectorName_labelled, perc) %>%
kable() %>%
kable_styling()| SectorName | SectorName_labelled | perc |
|---|---|---|
| 3 | ACTIVITIES OF HOUSEHOLDS AS EMPLOYERS; UNDIFFERENTIATED GOODS-AND SERVICES-PRODUCING ACTIVITIES OF HOUSEHOLDS FOR OWN US | 35.652378 |
| 4 | ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES | 31.570055 |
| 16 | Not found | 31.317619 |
| 22 | WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES | 30.008923 |
| 17 | OTHER SERVICE ACTIVITIES | 21.102417 |
| 7 | CONSTRUCTION | 20.589291 |
| 21 | TRANSPORTATION AND STORAGE | 19.415919 |
| 18 | PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES | 19.369714 |
| 12 | INFORMATION AND COMMUNICATION | 19.034279 |
| 1 | ACCOMMODATION AND FOOD SERVICE ACTIVITIES | 18.738635 |
| 9 | ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY | 18.008232 |
| 10 | FINANCIAL AND INSURANCE ACTIVITIES | 16.529705 |
| 23 | WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES | 16.373830 |
| 11 | HUMAN HEALTH AND SOCIAL WORK ACTIVITIES | 16.037739 |
| 6 | ARTS, ENTERTAINMENT AND RECREATION | 15.255060 |
| 13 | MANUFACTURING | 14.939669 |
| 20 | REAL ESTATE ACTIVITIES | 13.504099 |
| 8 | EDUCATION | 13.065534 |
| 19 | PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY | 10.051123 |
| 5 | AGRICULTURE, FORESTRY AND FISHING | 9.709408 |
Exploring this workforce makeup in the context of income shows that there are some sectors where outsourced workers are paid more and some where they are paid less than non-outsourced workers. The plot below visualises this.
Sectors where outsourced workers are paid less:
Sectors where outsourced workers are paid more:
Note that in 2 or 3 of the Sectors where outsourced workers are paid less are low-paying Sectors. (this needs to be double-checked)
annotation_df <- sector_summary_3 %>%
filter(outsourcing_status == "Not outsourced") %>%
select(SectorName_short, N) %>%
group_by(SectorName_short) %>%
summarise(
N = sum(N)
) %>%
mutate(
ypos = 110000
)
sector_summary_3 %>%
# mutate(
# SectorName = as.factor(SectorName)
# ) %>%
ggplot(., aes(wtd_avg_income,SectorName_short, size = perc, colour = outsourcing_status)) +
geom_point(position = "dodge") +
theme_minimal() +
theme(legend.position = "bottom",
legend.title = element_blank())+
#coord_flip() +
scale_x_continuous(breaks=seq(0,max(sector_summary$wtd_avg_income), 25000)) +
scale_colour_manual(values=colours) +
geom_text(inherit.aes=F,data=annotation_df, aes(x=ypos, y=SectorName_short, label = paste0("N = ", N)), hjust=1, nudge_x=20000) +
guides(size=FALSE) # remove size legend as gauging size is difficultsector_summary_paysplit <- data %>%
group_by(SectorName, SectorName_labelled, income_group, outsourcing_status) %>%
drop_na(income_group) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
ungroup() %>%
group_by(SectorName, income_group) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum),
SectorName_labelled = case_when(SectorName_labelled == "NA" ~ NA,
TRUE ~ SectorName_labelled),
SectorName_short = SectorName_labelled
) %>%
# make the sector names more readable
separate_wider_delim(SectorName_short, names = c("SectorName_short", "SectorName_short_detail"), delim=";",
too_few = "align_start") %>%
mutate(
SectorName_short = factor(stringr::str_to_sentence(SectorName_short)),
SectorName_short_detail = factor(stringr::str_to_sentence(SectorName_short_detail)),
)
write_csv(sector_summary_paysplit, file="../outputs/sector_summary_paysplit.csv")for(group in unique(sector_summary_paysplit$income_group)){
plot_data <- sector_summary_paysplit %>%
filter(income_group==group) %>%
drop_na(SectorName_short) %>%
droplevels()
# set ypos for labels
max_value <- max(plot_data$perc)
ypos <- 0.8 * max_value
annotation_df <- plot_data %>%
filter(outsourcing_status == "Not outsourced") %>%
select(SectorName_short, N) %>%
group_by(SectorName_short) %>%
mutate(
ypos = ypos
)
plot <- ggplot(plot_data, aes(SectorName_short, perc, fill = outsourcing_status)) +
geom_col() +
geom_text(inherit.aes=F,data=annotation_df, aes(x=SectorName_short, y=ypos, label = paste0("N = ", N)), hjust=1, nudge_y = 15) +
coord_flip() +
scale_fill_manual(values=many_colours) +
scale_y_continuous(breaks=seq(0,100,10)) +
ggtitle(paste0(group, " income"))
print(plot)
}The percentages below show be read as e.g. 20% of low paid workers in accommodation and food services are outsourced, compared to 16% of not low paid workers in accommodation and food services.
plot_data <- sector_summary_paysplit %>%
filter(outsourcing_status=="Outsourced")
max_value <- max(plot_data$perc)
ypos <- 0.8 * max_value
annotation_df <- plot_data %>%
filter(outsourcing_status == "Outsourced") %>%
select(SectorName_short, income_group, N) %>%
group_by(SectorName_short) %>%
mutate(
ypos = ypos
)
ggplot(plot_data, aes(SectorName_short, perc, fill = income_group)) +
geom_col(position = position_dodge2()) +
# geom_text(inherit.aes=F,data=annotation_df, aes(x=SectorName_short, y=ypos, label = paste0("N = ", N)), position = position_dodge(width=2)) +
coord_flip() +
scale_fill_manual(values=many_colours) +
scale_y_continuous(breaks=seq(0,50,10)) +
ggtitle("Percentage of outsourced workers within\neach sector by income group")for(group in unique(sector_summary_paysplit$income_group)){
# set ypos for labels
max_income <- max(sector_summary_paysplit$wtd_avg_income[which(sector_summary_paysplit$income_group==group)])
ypos <- 0.8 * max_income
annotation_df <- sector_summary_paysplit %>%
filter(income_group==group) %>%
filter(outsourcing_status == "Not outsourced") %>%
select(SectorName_short, N) %>%
group_by(SectorName_short) %>%
summarise(
N = sum(N)
) %>%
mutate(
ypos = ypos
)
plot <- sector_summary_paysplit %>%
filter(income_group == group) %>%
ggplot(., aes(wtd_avg_income,SectorName_short, size = perc, colour = outsourcing_status)) +
geom_point(position = "dodge") +
theme_minimal() +
theme(legend.position = "bottom",
legend.title = element_blank())+
#coord_flip() +
scale_x_continuous(breaks=seq(0,max_income, plyr::round_any(max_income/5, 100))) +
scale_colour_manual(values=colours) +
geom_text(inherit.aes=F,data=annotation_df, aes(x=ypos, y=SectorName_short, label = paste0("N = ", N)), hjust=1, nudge_x=20000) +
guides(size=FALSE) + # remove size legend as gauging size is difficult
ggtitle(paste0(group, " income"))
print(plot)
}sector_summary <- data %>%
group_by(outsourcing_group, SectorName, SectorName_labelled) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees),
avg_income = mean(income_annual, na.rm=T),
wtd_avg_income = weighted.mean(income_annual, w = NatRepemployees, na.rm=T)
) %>%
ungroup() %>%
group_by(outsourcing_group) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
perc = 100 * (Frequency/Sum),
SectorName_labelled = case_when(SectorName_labelled == "NA" ~ NA,
TRUE ~ SectorName_labelled),
SectorName_short = SectorName_labelled
) %>%
# make the sector names more readable
separate_wider_delim(SectorName_short, names = c("SectorName_short", "SectorName_short_detail"), delim=";",
too_few = "align_start") %>%
mutate(
SectorName_short = factor(stringr::str_to_sentence(SectorName_short)),
SectorName_short_detail = factor(stringr::str_to_sentence(SectorName_short_detail)),
)The plot below shows the distribution of sectors within each outsourcing group. The standout differences are:
(Really, this plot is better for showing the makeup of each type of outsroucing group - comparisons aer better made comparing outsroucing group within sectors. Here is a better way of interpeting these plots):
For the high indicator group, the sector with the largest proprotion of workers was WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES, closely followed by HUMAN HEALTH AND SOCIAL WORK ACTIVITIES.
For the likely agency group, the sector with the largest proprotion of workers was HUMAN HEALTH AND SOCIAL WORK ACTIVITIES.
For the outsourced group, the sector with the largest proprotion of workers was WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES, closely followed by HUMAN HEALTH AND SOCIAL WORK ACTIVITIES.
Note that also for the not outsourced group, the sector with the largest proprotion of workers was HUMAN HEALTH AND SOCIAL WORK ACTIVITIES, closely followed by WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES.
[This might say something about the demographics of the people who were sampled in this survey]
plot_data <- sector_summary %>%
drop_na(SectorName_labelled) %>%
droplevels()
plot_data %>%
ggplot(aes(outsourcing_group, perc, fill = as.factor(SectorName))) +
geom_col() +
coord_flip() +
scale_fill_manual(values=many_colours)sector_key <- data.frame("number" = seq(1,length(unique(plot_data$SectorName)),1),
"Sector" = unique(plot_data$SectorName_labelled))
sector_key %>%
kable() %>%
kable_styling(full_width = F)| number | Sector |
|---|---|
| 1 | ACCOMMODATION AND FOOD SERVICE ACTIVITIES |
| 2 | ACTIVITIES OF EXTRATERRITORIAL ORGANISATIONS AND BODIES |
| 3 | ACTIVITIES OF HOUSEHOLDS AS EMPLOYERS; UNDIFFERENTIATED GOODS-AND SERVICES-PRODUCING ACTIVITIES OF HOUSEHOLDS FOR OWN US |
| 4 | ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES |
| 5 | AGRICULTURE, FORESTRY AND FISHING |
| 6 | ARTS, ENTERTAINMENT AND RECREATION |
| 7 | CONSTRUCTION |
| 8 | EDUCATION |
| 9 | ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY |
| 10 | FINANCIAL AND INSURANCE ACTIVITIES |
| 11 | HUMAN HEALTH AND SOCIAL WORK ACTIVITIES |
| 12 | INFORMATION AND COMMUNICATION |
| 13 | MANUFACTURING |
| 14 | MINING AND QUARRYING |
| 15 | Not found |
| 16 | OTHER SERVICE ACTIVITIES |
| 17 | PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES |
| 18 | PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY |
| 19 | REAL ESTATE ACTIVITIES |
| 20 | TRANSPORTATION AND STORAGE |
| 21 | WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES |
| 22 | WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES |
plot_data %>%
select(outsourcing_group, SectorName, SectorName_labelled, perc) %>%
group_by(outsourcing_group) %>%
arrange(desc(perc), .by_group=TRUE) %>%
kable() %>%
kable_styling(full_width = F)| outsourcing_group | SectorName | SectorName_labelled | perc |
|---|---|---|---|
| Not outsourced | 11 | HUMAN HEALTH AND SOCIAL WORK ACTIVITIES | 16.7360613 |
| Not outsourced | 23 | WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES | 15.4267755 |
| Not outsourced | 8 | EDUCATION | 10.9976142 |
| Not outsourced | 13 | MANUFACTURING | 8.3872365 |
| Not outsourced | 19 | PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY | 7.9832203 |
| Not outsourced | 1 | ACCOMMODATION AND FOOD SERVICE ACTIVITIES | 5.8872856 |
| Not outsourced | 10 | FINANCIAL AND INSURANCE ACTIVITIES | 4.9095444 |
| Not outsourced | 21 | TRANSPORTATION AND STORAGE | 4.8584753 |
| Not outsourced | 12 | INFORMATION AND COMMUNICATION | 4.5805150 |
| Not outsourced | 18 | PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES | 3.9235934 |
| Not outsourced | 7 | CONSTRUCTION | 3.2214668 |
| Not outsourced | 17 | OTHER SERVICE ACTIVITIES | 2.7676269 |
| Not outsourced | 4 | ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES | 2.7603460 |
| Not outsourced | 6 | ARTS, ENTERTAINMENT AND RECREATION | 1.8053220 |
| Not outsourced | 20 | REAL ESTATE ACTIVITIES | 1.2521925 |
| Not outsourced | 9 | ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY | 1.2117371 |
| Not outsourced | 22 | WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES | 0.7627515 |
| Not outsourced | 5 | AGRICULTURE, FORESTRY AND FISHING | 0.4225971 |
| Not outsourced | 3 | ACTIVITIES OF HOUSEHOLDS AS EMPLOYERS; UNDIFFERENTIATED GOODS-AND SERVICES-PRODUCING ACTIVITIES OF HOUSEHOLDS FOR OWN US | 0.3123761 |
| Not outsourced | 16 | Not found | 0.3030015 |
| Not outsourced | 14 | MINING AND QUARRYING | 0.1034233 |
| Not outsourced | 2 | ACTIVITIES OF EXTRATERRITORIAL ORGANISATIONS AND BODIES | 0.0431916 |
| Outsourced | 23 | WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES | 14.8186202 |
| Outsourced | 11 | HUMAN HEALTH AND SOCIAL WORK ACTIVITIES | 14.1182403 |
| Outsourced | 8 | EDUCATION | 8.6944943 |
| Outsourced | 13 | MANUFACTURING | 7.1682245 |
| Outsourced | 4 | ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES | 7.1400736 |
| Outsourced | 1 | ACCOMMODATION AND FOOD SERVICE ACTIVITIES | 6.7289225 |
| Outsourced | 21 | TRANSPORTATION AND STORAGE | 5.5518952 |
| Outsourced | 12 | INFORMATION AND COMMUNICATION | 5.2270278 |
| Outsourced | 18 | PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES | 5.1295921 |
| Outsourced | 10 | FINANCIAL AND INSURANCE ACTIVITIES | 4.8852436 |
| Outsourced | 19 | PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY | 4.6701148 |
| Outsourced | 17 | OTHER SERVICE ACTIVITIES | 4.0418607 |
| Outsourced | 7 | CONSTRUCTION | 3.6893071 |
| Outsourced | 22 | WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES | 1.7774160 |
| Outsourced | 6 | ARTS, ENTERTAINMENT AND RECREATION | 1.6604915 |
| Outsourced | 9 | ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY | 1.2565601 |
| Outsourced | 3 | ACTIVITIES OF HOUSEHOLDS AS EMPLOYERS; UNDIFFERENTIATED GOODS-AND SERVICES-PRODUCING ACTIVITIES OF HOUSEHOLDS FOR OWN US | 0.9852412 |
| Outsourced | 20 | REAL ESTATE ACTIVITIES | 0.8869119 |
| Outsourced | 16 | Not found | 0.6672183 |
| Outsourced | 5 | AGRICULTURE, FORESTRY AND FISHING | 0.2801044 |
| Likely agency | 11 | HUMAN HEALTH AND SOCIAL WORK ACTIVITIES | 22.3782074 |
| Likely agency | 23 | WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES | 11.7348709 |
| Likely agency | 1 | ACCOMMODATION AND FOOD SERVICE ACTIVITIES | 9.5233357 |
| Likely agency | 13 | MANUFACTURING | 7.7512619 |
| Likely agency | 7 | CONSTRUCTION | 6.8292108 |
| Likely agency | 8 | EDUCATION | 5.9626362 |
| Likely agency | 21 | TRANSPORTATION AND STORAGE | 5.5687150 |
| Likely agency | 12 | INFORMATION AND COMMUNICATION | 5.1177601 |
| Likely agency | 4 | ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES | 5.0349824 |
| Likely agency | 19 | PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY | 4.9276124 |
| Likely agency | 10 | FINANCIAL AND INSURANCE ACTIVITIES | 4.2492270 |
| Likely agency | 18 | PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES | 2.5851476 |
| Likely agency | 17 | OTHER SERVICE ACTIVITIES | 1.8324543 |
| Likely agency | 6 | ARTS, ENTERTAINMENT AND RECREATION | 1.6318044 |
| Likely agency | 22 | WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES | 1.2974489 |
| Likely agency | 9 | ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY | 1.1617726 |
| Likely agency | 3 | ACTIVITIES OF HOUSEHOLDS AS EMPLOYERS; UNDIFFERENTIATED GOODS-AND SERVICES-PRODUCING ACTIVITIES OF HOUSEHOLDS FOR OWN US | 0.9038069 |
| Likely agency | 16 | Not found | 0.7203230 |
| Likely agency | 20 | REAL ESTATE ACTIVITIES | 0.5694658 |
| Likely agency | 5 | AGRICULTURE, FORESTRY AND FISHING | 0.2199565 |
| High indicators | 23 | WHOLESALE AND RETAIL TRADE; REPAIR OF MOTOR VEHICLES AND MOTORCYCLES | 18.4511626 |
| High indicators | 11 | HUMAN HEALTH AND SOCIAL WORK ACTIVITIES | 16.5456358 |
| High indicators | 8 | EDUCATION | 8.1099485 |
| High indicators | 13 | MANUFACTURING | 7.3160094 |
| High indicators | 21 | TRANSPORTATION AND STORAGE | 6.9715309 |
| High indicators | 12 | INFORMATION AND COMMUNICATION | 5.9221553 |
| High indicators | 10 | FINANCIAL AND INSURANCE ACTIVITIES | 5.0133674 |
| High indicators | 18 | PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES | 4.6896460 |
| High indicators | 4 | ADMINISTRATIVE AND SUPPORT SERVICE ACTIVITIES | 4.0053150 |
| High indicators | 1 | ACCOMMODATION AND FOOD SERVICE ACTIVITIES | 3.9750691 |
| High indicators | 17 | OTHER SERVICE ACTIVITIES | 3.8158394 |
| High indicators | 7 | CONSTRUCTION | 3.3883241 |
| High indicators | 19 | PUBLIC ADMINISTRATION AND DEFENCE; COMPULSORY SOCIAL SECURITY | 2.8467744 |
| High indicators | 9 | ELECTRICITY, GAS, STEAM AND AIR CONDITIONING SUPPLY | 1.7075018 |
| High indicators | 20 | REAL ESTATE ACTIVITIES | 1.6732156 |
| High indicators | 6 | ARTS, ENTERTAINMENT AND RECREATION | 1.3609799 |
| High indicators | 22 | WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND REMEDIATION ACTIVITIES | 1.2566393 |
| High indicators | 16 | Not found | 0.7135371 |
| High indicators | 3 | ACTIVITIES OF HOUSEHOLDS AS EMPLOYERS; UNDIFFERENTIATED GOODS-AND SERVICES-PRODUCING ACTIVITIES OF HOUSEHOLDS FOR OWN US | 0.2744929 |
13/05 MORE TO DO HERE FOR SECTOR AS ABOVE
ethnicity_statistics <- data %>%
group_by(outsourcing_status, Ethnicity_labelled) %>%
summarise(
n = n(), # count cases
Frequency = sum(NatRepemployees) # count weighted cases
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum),
Ethnicity_short = Ethnicity_labelled
) %>%
separate_wider_delim(Ethnicity_short,
names = c("Ethnicity_short", "Ethnicity detail"),
delim = stringr::regex(" / |, "), # use multiple delims
too_few = "align_start",
too_many = "merge")
readr::write_csv(ethnicity_statistics, file = "../outputs/data/ethnicity_stats_1.csv")ethnicities <- as.vector(unique(haven::as_factor(data$Ethnicity)))
non_white_ethnicities <- ethnicities[!(ethnicities %in% "English / Welsh / Scottish / Northern Irish / British")]
# Will throw NA warning. I think this OK but investigate how to avoid the problem
summary_table <- data %>%
mutate(
Ethnicity = haven::as_factor(Ethnicity)
) %>%
mutate(
Ethnicity = forcats::fct_collapse(as.character(Ethnicity),
"White British" = c("English / Welsh / Scottish / Northern Irish / British"),
"Non-White British" = non_white_ethnicities)
) %>%
group_by(outsourcing_status, Ethnicity) %>%
summarise(
n = n()
) %>%
mutate(
Sum = sum(n),
Percentage = 100 * (n / Sum)
)
group_1 <- t(tibble("present"=summary_table[which(summary_table["Ethnicity"]=="White British" &
summary_table["outsourcing_status"]=="Outsourced"),"n"],
"not present" = summary_table[which(summary_table["Ethnicity"]=="Non-White British" &
summary_table["outsourcing_status"]=="Outsourced"),"n"]
))
group_2 <- t(tibble("present"=summary_table[which(summary_table["Ethnicity"]=="White British" &
summary_table["outsourcing_status"]=="Not outsourced"),"n"],
"not present" = summary_table[which(summary_table["Ethnicity"]=="Non-White British" &
summary_table["outsourcing_status"]=="Not outsourced"),"n"]
))
comp_mat <- as.matrix(cbind(group_2, group_1)) # matrix for crosstable
x2 <- gmodels::CrossTable(comp_mat, fisher=TRUE)
# `r if(x2[["chisq"]][["p.value"]] < .001, "< .001", paste0("= ", round(x2[["chisq"]][["p.value"]],2)))`).
# (chi-square = `r round(x2[["chisq"]][["statistic"]][["X-squared"]],2)`, *p* = `r round(x2[["chisq"]][["p.value"]],3)`).Breaking down by ethnicity shows that the outsourced group has a lower proportion of White workers compared to the non-outsourced group. For example, in the outsourced group, the proportion of British (‘White’) workers is 66.91 %, compared to 78.01% in the not outsourced group. Needless to say, this means that there is a correspondingly higher proportion of workers from minority backgrounds in the outsourced group, notably from African (4.1%) and other White backgrounds (5.5, amongst others.12 These differences mean that outsourced workers are 1.87 times more likely to be a member of minority ethnicity than non-outsourced workers.
ethnicity_statistics %>%
# mutate(
# Ethnicity = haven::as_factor(Ethnicity)
# ) %>%
knitr::kable(.,digits = 2) %>%
kable_styling(full_width = F)| outsourcing_status | Ethnicity_labelled | n | Frequency | N | Sum | Percentage | Ethnicity_short | Ethnicity detail |
|---|---|---|---|---|---|---|---|---|
| Not outsourced | English / Welsh / Scottish / Northern Irish / British | 6623 | 6589.16 | 8472 | 8446.64 | 78.01 | English | Welsh / Scottish / Northern Irish / British |
| Not outsourced | Irish | 111 | 98.75 | 8472 | 8446.64 | 1.17 | Irish | NA |
| Not outsourced | Gypsy or Irish Traveller | 6 | 8.32 | 8472 | 8446.64 | 0.10 | Gypsy or Irish Traveller | NA |
| Not outsourced | Roma | 4 | 5.24 | 8472 | 8446.64 | 0.06 | Roma | NA |
| Not outsourced | Any other White background | 369 | 385.42 | 8472 | 8446.64 | 4.56 | Any other White background | NA |
| Not outsourced | White and Black Caribbean | 84 | 53.22 | 8472 | 8446.64 | 0.63 | White and Black Caribbean | NA |
| Not outsourced | White and Black African | 40 | 20.43 | 8472 | 8446.64 | 0.24 | White and Black African | NA |
| Not outsourced | White and Asian | 61 | 32.39 | 8472 | 8446.64 | 0.38 | White and Asian | NA |
| Not outsourced | Any other Mixed / Multiple ethnic background | 59 | 33.60 | 8472 | 8446.64 | 0.40 | Any other Mixed | Multiple ethnic background |
| Not outsourced | Indian | 170 | 237.76 | 8472 | 8446.64 | 2.81 | Indian | NA |
| Not outsourced | Pakistani | 93 | 96.09 | 8472 | 8446.64 | 1.14 | Pakistani | NA |
| Not outsourced | Bangladeshi | 47 | 53.46 | 8472 | 8446.64 | 0.63 | Bangladeshi | NA |
| Not outsourced | Chinese | 70 | 124.50 | 8472 | 8446.64 | 1.47 | Chinese | NA |
| Not outsourced | Any other Asian background | 57 | 118.35 | 8472 | 8446.64 | 1.40 | Any other Asian background | NA |
| Not outsourced | African | 232 | 157.09 | 8472 | 8446.64 | 1.86 | African | NA |
| Not outsourced | Caribbean | 74 | 56.56 | 8472 | 8446.64 | 0.67 | Caribbean | NA |
| Not outsourced | Any other Black, Black British, or Caribbean background | 36 | 25.60 | 8472 | 8446.64 | 0.30 | Any other Black | Black British, or Caribbean background |
| Not outsourced | Arab | 12 | 20.47 | 8472 | 8446.64 | 0.24 | Arab | NA |
| Not outsourced | Any other ethnic group | 13 | 23.40 | 8472 | 8446.64 | 0.28 | Any other ethnic group | NA |
| Not outsourced | Don’t think of myself as any of these | 7 | 5.87 | 8472 | 8446.64 | 0.07 | Don’t think of myself as any of these | NA |
| Not outsourced | Prefer not to say | 22 | 23.54 | 8472 | 8446.64 | 0.28 | Prefer not to say | NA |
| Not outsourced | NA | 282 | 277.44 | 8472 | 8446.64 | 3.28 | NA | NA |
| Outsourced | English / Welsh / Scottish / Northern Irish / British | 1124 | 1143.07 | 1683 | 1708.36 | 66.91 | English | Welsh / Scottish / Northern Irish / British |
| Outsourced | Irish | 17 | 14.86 | 1683 | 1708.36 | 0.87 | Irish | NA |
| Outsourced | Gypsy or Irish Traveller | 2 | 2.48 | 1683 | 1708.36 | 0.14 | Gypsy or Irish Traveller | NA |
| Outsourced | Roma | 3 | 2.25 | 1683 | 1708.36 | 0.13 | Roma | NA |
| Outsourced | Any other White background | 80 | 93.96 | 1683 | 1708.36 | 5.50 | Any other White background | NA |
| Outsourced | White and Black Caribbean | 11 | 5.53 | 1683 | 1708.36 | 0.32 | White and Black Caribbean | NA |
| Outsourced | White and Black African | 26 | 14.61 | 1683 | 1708.36 | 0.86 | White and Black African | NA |
| Outsourced | White and Asian | 12 | 9.13 | 1683 | 1708.36 | 0.53 | White and Asian | NA |
| Outsourced | Any other Mixed / Multiple ethnic background | 23 | 15.89 | 1683 | 1708.36 | 0.93 | Any other Mixed | Multiple ethnic background |
| Outsourced | Indian | 55 | 73.98 | 1683 | 1708.36 | 4.33 | Indian | NA |
| Outsourced | Pakistani | 49 | 53.85 | 1683 | 1708.36 | 3.15 | Pakistani | NA |
| Outsourced | Bangladeshi | 21 | 23.04 | 1683 | 1708.36 | 1.35 | Bangladeshi | NA |
| Outsourced | Chinese | 12 | 21.04 | 1683 | 1708.36 | 1.23 | Chinese | NA |
| Outsourced | Any other Asian background | 26 | 44.80 | 1683 | 1708.36 | 2.62 | Any other Asian background | NA |
| Outsourced | African | 111 | 69.96 | 1683 | 1708.36 | 4.10 | African | NA |
| Outsourced | Caribbean | 15 | 15.11 | 1683 | 1708.36 | 0.88 | Caribbean | NA |
| Outsourced | Any other Black, Black British, or Caribbean background | 16 | 11.79 | 1683 | 1708.36 | 0.69 | Any other Black | Black British, or Caribbean background |
| Outsourced | Arab | 7 | 12.03 | 1683 | 1708.36 | 0.70 | Arab | NA |
| Outsourced | Any other ethnic group | 3 | 7.06 | 1683 | 1708.36 | 0.41 | Any other ethnic group | NA |
| Outsourced | Don’t think of myself as any of these | 5 | 2.94 | 1683 | 1708.36 | 0.17 | Don’t think of myself as any of these | NA |
| Outsourced | Prefer not to say | 4 | 6.92 | 1683 | 1708.36 | 0.40 | Prefer not to say | NA |
| Outsourced | NA | 61 | 64.07 | 1683 | 1708.36 | 3.75 | NA | NA |
data %>%
# get values of labels
# mutate_all(haven::as_factor) %>%
group_by(outsourcing_status, Ethnicity) %>%
summarise(
Frequency = sum(NatRepemployees),
n = n()
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum),
N = n()
) %>%
ggplot(., aes(outsourcing_status, Percentage, fill = as.factor(Ethnicity))) +
geom_col(colour="black") +
annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal()ethnicity_key <- data.frame("number" = seq(1,22,1),
"ethnicity" = c(levels(ethnicity_statistics$Ethnicity_labelled), NA))
ethnicity_key %>%
kable() %>%
kable_styling(full_width = F)| number | ethnicity |
|---|---|
| 1 | English / Welsh / Scottish / Northern Irish / British |
| 2 | Irish |
| 3 | Gypsy or Irish Traveller |
| 4 | Roma |
| 5 | Any other White background |
| 6 | White and Black Caribbean |
| 7 | White and Black African |
| 8 | White and Asian |
| 9 | Any other Mixed / Multiple ethnic background |
| 10 | Indian |
| 11 | Pakistani |
| 12 | Bangladeshi |
| 13 | Chinese |
| 14 | Any other Asian background |
| 15 | African |
| 16 | Caribbean |
| 17 | Any other Black, Black British, or Caribbean background |
| 18 | Arab |
| 19 | Any other ethnic group |
| 20 | Don’t think of myself as any of these |
| 21 | Prefer not to say |
| 22 | NA |
ethnicity_statistics_2 <- data %>%
group_by(outsourcing_status, Ethnicity_collapsed) %>%
summarise(
n = n(), # count cases
Frequency = sum(NatRepemployees) # count weighted cases
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
) #%>%
# separate_wider_delim(Ethnicity_short,
# names = c("Ethnicity_short", "Ethnicity detail"),
# delim = stringr::regex(" / |, "), # use multiple delims
# too_few = "align_start",
# too_many = "merge")
#readr::write_csv(ethnicity_statistics, file = "../outputs/data/ethnicity_stats_2.csv")ethnicity_statistics_2 %>%
# mutate(
# Ethnicity = haven::as_factor(Ethnicity)
# ) %>%
knitr::kable(.,digits = 2) %>%
kable_styling(full_width = F)| outsourcing_status | Ethnicity_collapsed | n | Frequency | N | Sum | Percentage |
|---|---|---|---|---|---|---|
| Not outsourced | White | 6734 | 6687.91 | 8472 | 8446.64 | 79.18 |
| Not outsourced | White other | 379 | 398.98 | 8472 | 8446.64 | 4.72 |
| Not outsourced | Black Caribbean | 158 | 109.78 | 8472 | 8446.64 | 1.30 |
| Not outsourced | Black African | 272 | 177.52 | 8472 | 8446.64 | 2.10 |
| Not outsourced | Mixed other | 120 | 65.99 | 8472 | 8446.64 | 0.78 |
| Not outsourced | South Asian | 310 | 387.30 | 8472 | 8446.64 | 4.59 |
| Not outsourced | East Asian | 70 | 124.50 | 8472 | 8446.64 | 1.47 |
| Not outsourced | Other | 381 | 448.60 | 8472 | 8446.64 | 5.31 |
| Not outsourced | Black other | 36 | 25.60 | 8472 | 8446.64 | 0.30 |
| Not outsourced | Arab | 12 | 20.47 | 8472 | 8446.64 | 0.24 |
| Outsourced | White | 1141 | 1157.93 | 1683 | 1708.36 | 67.78 |
| Outsourced | White other | 85 | 98.68 | 1683 | 1708.36 | 5.78 |
| Outsourced | Black Caribbean | 26 | 20.64 | 1683 | 1708.36 | 1.21 |
| Outsourced | Black African | 137 | 84.58 | 1683 | 1708.36 | 4.95 |
| Outsourced | Mixed other | 35 | 25.02 | 1683 | 1708.36 | 1.46 |
| Outsourced | South Asian | 125 | 150.87 | 1683 | 1708.36 | 8.83 |
| Outsourced | East Asian | 12 | 21.04 | 1683 | 1708.36 | 1.23 |
| Outsourced | Other | 99 | 125.78 | 1683 | 1708.36 | 7.36 |
| Outsourced | Black other | 16 | 11.79 | 1683 | 1708.36 | 0.69 |
| Outsourced | Arab | 7 | 12.03 | 1683 | 1708.36 | 0.70 |
data %>%
# get values of labels
# mutate_all(haven::as_factor) %>%
group_by(outsourcing_status, Ethnicity_collapsed) %>%
summarise(
Frequency = sum(NatRepemployees),
n = n()
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum),
N = n()
) %>%
ggplot(., aes(outsourcing_status, Percentage, fill = Ethnicity_collapsed)) +
geom_col(colour="black") +
annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$N)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal()mod <- glm(outsourcing_status ~ Ethnicity_collapsed, data, family="binomial", weights=NatRepemployees)
summary(mod)
Call:
glm(formula = outsourcing_status ~ Ethnicity_collapsed, family = "binomial",
data = data, weights = NatRepemployees)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.75366 0.03183 -55.095 < 2e-16 ***
Ethnicity_collapsedWhite other 0.35669 0.11685 3.053 0.002268 **
Ethnicity_collapsedBlack Caribbean 0.08232 0.24203 0.340 0.733754
Ethnicity_collapsedBlack African 1.01225 0.13590 7.448 9.45e-14 ***
Ethnicity_collapsedMixed other 0.78383 0.23693 3.308 0.000939 ***
Ethnicity_collapsedSouth Asian 0.81086 0.10111 8.020 1.06e-15 ***
Ethnicity_collapsedEast Asian -0.02436 0.23787 -0.102 0.918417
Ethnicity_collapsedOther 0.48211 0.10579 4.557 5.19e-06 ***
Ethnicity_collapsedBlack other 0.97803 0.35344 2.767 0.005655 **
Ethnicity_collapsedArab 1.22173 0.36469 3.350 0.000808 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9201.8 on 10154 degrees of freedom
Residual deviance: 9065.3 on 10145 degrees of freedom
AIC: 10088
Number of Fisher Scoring iterations: 4
coef_table <- data.frame("estimate" = mod[["coefficients"]]) %>%
mutate(
or = round(exp(estimate),2)
)Comparison of ethnicities indicates that some groups are statistically more likely to be outsourced than others
White other, Black Caribbean, and East Asian workers are no more or less likely than White workers to be outsourced.
ethnicity_summary_paysplit <- data %>%
group_by(outsourcing_status, income_group, Ethnicity_labelled) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees)
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum),
Ethnicity_short = Ethnicity_labelled
) %>%
separate_wider_delim(Ethnicity_short,
names = c("Ethnicity_short", "Ethnicity detail"),
delim = stringr::regex(" / |, "), # use multiple delims
too_few = "align_start",
too_many = "merge")
readr::write_csv(ethnicity_statistics, file = "../outputs/data/ethnicity_stats_paysplit_1.csv")temp_data <- ethnicity_summary_paysplit %>%
drop_na(income_group)
for(group in unique(temp_data$income_group)){
plot_data <- temp_data %>%
filter(income_group==group)
plot <- plot_data %>%
ggplot(., aes(Ethnicity_short, Percentage, fill = outsourcing_status)) +
geom_col(colour="black", position = position_dodge()) +
#annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal() +
theme(
legend.position = "bottom"
) +
ggtitle(paste0(group, " income"))
print(plot)
}# ethnicity_summary_paysplit %>%
# drop_na(income_group) %>%
# ggplot(., aes(outsourcing_status, Percentage, fill = Ethnicity_short)) +
# facet_grid(rows=~income_group) +
# geom_col(colour="black") +
# #annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
# coord_flip() +
# scale_fill_manual(values = many_colours, name = "Ethnicity") +
# xlab("Outsourcing group") +
# theme_minimal() +
# theme(
# legend.position = "bottom"
# )ethnicity_summary_paysplit <- data %>%
group_by(outsourcing_status, income_group, Ethnicity_collapsed) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees)
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
)
#readr::write_csv(ethnicity_statistics, file = "../outputs/data/ethnicity_stats_paysplit_1.csv")temp_data <- ethnicity_summary_paysplit %>%
drop_na(income_group)
for(group in unique(temp_data$income_group)){
plot_data <- temp_data %>%
filter(income_group==group)
plot <- plot_data %>%
ggplot(., aes(Ethnicity_collapsed, Percentage, fill = outsourcing_status)) +
geom_col(colour="black", position = position_dodge()) +
#annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal() +
theme(
legend.position = "bottom"
) +
ggtitle(paste0(group, " income"))
print(plot)
}# ethnicity_summary_paysplit %>%
# drop_na(income_group) %>%
# ggplot(., aes(outsourcing_status, Percentage, fill = Ethnicity_short)) +
# facet_grid(rows=~income_group) +
# geom_col(colour="black") +
# #annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
# coord_flip() +
# scale_fill_manual(values = many_colours, name = "Ethnicity") +
# xlab("Outsourcing group") +
# theme_minimal() +
# theme(
# legend.position = "bottom"
# )mod <- glm(outsourcing_status ~ income_group*Ethnicity_collapsed, data, family="binomial")
summary(mod)
Call:
glm(formula = outsourcing_status ~ income_group * Ethnicity_collapsed,
family = "binomial", data = data)
Coefficients:
Estimate Std. Error z value
(Intercept) -1.83224 0.04438 -41.286
income_groupLow 0.17246 0.06883 2.506
Ethnicity_collapsedWhite other 0.33035 0.16107 2.051
Ethnicity_collapsedBlack Caribbean -0.19371 0.31031 -0.624
Ethnicity_collapsedBlack African 1.23328 0.13921 8.859
Ethnicity_collapsedMixed other 0.23623 0.28662 0.824
Ethnicity_collapsedSouth Asian 0.92653 0.14299 6.480
Ethnicity_collapsedEast Asian 0.17932 0.36655 0.489
Ethnicity_collapsedOther 0.42737 0.18195 2.349
Ethnicity_collapsedBlack other 1.19039 0.39319 3.027
Ethnicity_collapsedArab 1.32142 0.51830 2.550
income_groupLow:Ethnicity_collapsedWhite other -0.13691 0.27257 -0.502
income_groupLow:Ethnicity_collapsedBlack Caribbean 0.55421 0.45290 1.224
income_groupLow:Ethnicity_collapsedBlack African -0.44101 0.23655 -1.864
income_groupLow:Ethnicity_collapsedMixed other 0.64585 0.41324 1.563
income_groupLow:Ethnicity_collapsedSouth Asian -0.01517 0.23811 -0.064
income_groupLow:Ethnicity_collapsedEast Asian -0.12898 0.73289 -0.176
income_groupLow:Ethnicity_collapsedOther 0.33341 0.27581 1.209
income_groupLow:Ethnicity_collapsedBlack other -0.70927 0.69591 -1.019
income_groupLow:Ethnicity_collapsedArab -0.35479 1.33094 -0.267
Pr(>|z|)
(Intercept) < 2e-16 ***
income_groupLow 0.01222 *
Ethnicity_collapsedWhite other 0.04027 *
Ethnicity_collapsedBlack Caribbean 0.53246
Ethnicity_collapsedBlack African < 2e-16 ***
Ethnicity_collapsedMixed other 0.40983
Ethnicity_collapsedSouth Asian 9.19e-11 ***
Ethnicity_collapsedEast Asian 0.62469
Ethnicity_collapsedOther 0.01883 *
Ethnicity_collapsedBlack other 0.00247 **
Ethnicity_collapsedArab 0.01079 *
income_groupLow:Ethnicity_collapsedWhite other 0.61548
income_groupLow:Ethnicity_collapsedBlack Caribbean 0.22107
income_groupLow:Ethnicity_collapsedBlack African 0.06228 .
income_groupLow:Ethnicity_collapsedMixed other 0.11808
income_groupLow:Ethnicity_collapsedSouth Asian 0.94921
income_groupLow:Ethnicity_collapsedEast Asian 0.86030
income_groupLow:Ethnicity_collapsedOther 0.22673
income_groupLow:Ethnicity_collapsedBlack other 0.30811
income_groupLow:Ethnicity_collapsedArab 0.78980
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8127.6 on 8942 degrees of freedom
Residual deviance: 7956.9 on 8923 degrees of freedom
(1212 observations deleted due to missingness)
AIC: 7996.9
Number of Fisher Scoring iterations: 4
test_data <- data %>%
drop_na(income_group) %>%
mutate(
income_group = factor(income_group, levels = c("Not low", "Low")),
Ethnicity_short = Ethnicity_labelled
) %>%
separate_wider_delim(Ethnicity_short,
names = c("Ethnicity_short", "Ethnicity detail"),
delim = " / ",
too_few = "align_start",
too_many = "merge") %>%
mutate(
Ethnicity_short = forcats::fct_relevel(factor(Ethnicity_short), "English")
)
test <- glm(outsourcing_status ~ Age + Gender + income_group + Ethnicity_short, family = "binomial", data = test_data)
summary(test)
Call:
glm(formula = outsourcing_status ~ Age + Gender + income_group +
Ethnicity_short, family = "binomial", data = test_data)
Coefficients:
Estimate
(Intercept) -1.122623
Age -0.024661
GenderMale 0.531731
GenderOther 0.128987
GenderPrefer not to say -0.240452
income_groupLow 0.276827
Ethnicity_shortAfrican 0.750804
Ethnicity_shortAny other Asian background 0.691922
Ethnicity_shortAny other Black, Black British, or Caribbean background 0.752002
Ethnicity_shortAny other ethnic group 0.220800
Ethnicity_shortAny other Mixed 0.673451
Ethnicity_shortAny other White background 0.171750
Ethnicity_shortArab 0.989803
Ethnicity_shortBangladeshi 0.703248
Ethnicity_shortCaribbean 0.183752
Ethnicity_shortChinese 0.021374
Ethnicity_shortDon’t think of myself as any of these 1.449163
Ethnicity_shortGypsy or Irish Traveller 0.282974
Ethnicity_shortIndian 0.548848
Ethnicity_shortIrish -0.126226
Ethnicity_shortPakistani 0.864916
Ethnicity_shortPrefer not to say -0.006321
Ethnicity_shortRoma 1.082635
Ethnicity_shortWhite and Asian -0.090237
Ethnicity_shortWhite and Black African 1.155726
Ethnicity_shortWhite and Black Caribbean -0.279236
Std. Error
(Intercept) 0.111181
Age 0.002392
GenderMale 0.061348
GenderOther 0.780590
GenderPrefer not to say 1.056849
income_groupLow 0.061893
Ethnicity_shortAfrican 0.125846
Ethnicity_shortAny other Asian background 0.258347
Ethnicity_shortAny other Black, Black British, or Caribbean background 0.330047
Ethnicity_shortAny other ethnic group 0.647274
Ethnicity_shortAny other Mixed 0.264347
Ethnicity_shortAny other White background 0.135112
Ethnicity_shortArab 0.483591
Ethnicity_shortBangladeshi 0.286337
Ethnicity_shortCaribbean 0.300048
Ethnicity_shortChinese 0.319975
Ethnicity_shortDon’t think of myself as any of these 0.659571
Ethnicity_shortGypsy or Irish Traveller 0.843433
Ethnicity_shortIndian 0.163023
Ethnicity_shortIrish 0.292763
Ethnicity_shortPakistani 0.188944
Ethnicity_shortPrefer not to say 0.633334
Ethnicity_shortRoma 0.769140
Ethnicity_shortWhite and Asian 0.335327
Ethnicity_shortWhite and Black African 0.262938
Ethnicity_shortWhite and Black Caribbean 0.345023
z value
(Intercept) -10.097
Age -10.311
GenderMale 8.667
GenderOther 0.165
GenderPrefer not to say -0.228
income_groupLow 4.473
Ethnicity_shortAfrican 5.966
Ethnicity_shortAny other Asian background 2.678
Ethnicity_shortAny other Black, Black British, or Caribbean background 2.278
Ethnicity_shortAny other ethnic group 0.341
Ethnicity_shortAny other Mixed 2.548
Ethnicity_shortAny other White background 1.271
Ethnicity_shortArab 2.047
Ethnicity_shortBangladeshi 2.456
Ethnicity_shortCaribbean 0.612
Ethnicity_shortChinese 0.067
Ethnicity_shortDon’t think of myself as any of these 2.197
Ethnicity_shortGypsy or Irish Traveller 0.336
Ethnicity_shortIndian 3.367
Ethnicity_shortIrish -0.431
Ethnicity_shortPakistani 4.578
Ethnicity_shortPrefer not to say -0.010
Ethnicity_shortRoma 1.408
Ethnicity_shortWhite and Asian -0.269
Ethnicity_shortWhite and Black African 4.395
Ethnicity_shortWhite and Black Caribbean -0.809
Pr(>|z|)
(Intercept) < 2e-16
Age < 2e-16
GenderMale < 2e-16
GenderOther 0.868753
GenderPrefer not to say 0.820021
income_groupLow 7.73e-06
Ethnicity_shortAfrican 2.43e-09
Ethnicity_shortAny other Asian background 0.007400
Ethnicity_shortAny other Black, Black British, or Caribbean background 0.022698
Ethnicity_shortAny other ethnic group 0.733011
Ethnicity_shortAny other Mixed 0.010847
Ethnicity_shortAny other White background 0.203669
Ethnicity_shortArab 0.040680
Ethnicity_shortBangladeshi 0.014049
Ethnicity_shortCaribbean 0.540267
Ethnicity_shortChinese 0.946741
Ethnicity_shortDon’t think of myself as any of these 0.028011
Ethnicity_shortGypsy or Irish Traveller 0.737246
Ethnicity_shortIndian 0.000761
Ethnicity_shortIrish 0.666357
Ethnicity_shortPakistani 4.70e-06
Ethnicity_shortPrefer not to say 0.992037
Ethnicity_shortRoma 0.159252
Ethnicity_shortWhite and Asian 0.787852
Ethnicity_shortWhite and Black African 1.11e-05
Ethnicity_shortWhite and Black Caribbean 0.418329
(Intercept) ***
Age ***
GenderMale ***
GenderOther
GenderPrefer not to say
income_groupLow ***
Ethnicity_shortAfrican ***
Ethnicity_shortAny other Asian background **
Ethnicity_shortAny other Black, Black British, or Caribbean background *
Ethnicity_shortAny other ethnic group
Ethnicity_shortAny other Mixed *
Ethnicity_shortAny other White background
Ethnicity_shortArab *
Ethnicity_shortBangladeshi *
Ethnicity_shortCaribbean
Ethnicity_shortChinese
Ethnicity_shortDon’t think of myself as any of these *
Ethnicity_shortGypsy or Irish Traveller
Ethnicity_shortIndian ***
Ethnicity_shortIrish
Ethnicity_shortPakistani ***
Ethnicity_shortPrefer not to say
Ethnicity_shortRoma
Ethnicity_shortWhite and Asian
Ethnicity_shortWhite and Black African ***
Ethnicity_shortWhite and Black Caribbean
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 7918.6 on 8739 degrees of freedom
Residual deviance: 7567.2 on 8714 degrees of freedom
(203 observations deleted due to missingness)
AIC: 7619.2
Number of Fisher Scoring iterations: 4
Another way of looking at this is to calculate, for each ethnicity, the proportion of workers in each outsourcing group. Doing so yields the plot below.14
ethnicity_statistics <- data %>%
# get values of labels
# mutate_all(haven::as_factor) %>%
group_by(Ethnicity, outsourcing_status) %>%
summarise(
Frequency = n()
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
) %>%
rename(
`Outsourcing group` = outsourcing_status
)
ethnicity_statistics %>%
mutate(
Ethnicity = haven::as_factor(Ethnicity)
) %>%
knitr::kable(.,digits = 2) %>%
kable_styling(full_width = F)| Ethnicity | Outsourcing group | Frequency | Sum | Percentage |
|---|---|---|---|---|
| English / Welsh / Scottish / Northern Irish / British | Not outsourced | 6623 | 7747 | 85.49 |
| English / Welsh / Scottish / Northern Irish / British | Outsourced | 1124 | 7747 | 14.51 |
| Irish | Not outsourced | 111 | 128 | 86.72 |
| Irish | Outsourced | 17 | 128 | 13.28 |
| Gypsy or Irish Traveller | Not outsourced | 6 | 8 | 75.00 |
| Gypsy or Irish Traveller | Outsourced | 2 | 8 | 25.00 |
| Roma | Not outsourced | 4 | 7 | 57.14 |
| Roma | Outsourced | 3 | 7 | 42.86 |
| Any other White background | Not outsourced | 369 | 449 | 82.18 |
| Any other White background | Outsourced | 80 | 449 | 17.82 |
| White and Black Caribbean | Not outsourced | 84 | 95 | 88.42 |
| White and Black Caribbean | Outsourced | 11 | 95 | 11.58 |
| White and Black African | Not outsourced | 40 | 66 | 60.61 |
| White and Black African | Outsourced | 26 | 66 | 39.39 |
| White and Asian | Not outsourced | 61 | 73 | 83.56 |
| White and Asian | Outsourced | 12 | 73 | 16.44 |
| Any other Mixed / Multiple ethnic background | Not outsourced | 59 | 82 | 71.95 |
| Any other Mixed / Multiple ethnic background | Outsourced | 23 | 82 | 28.05 |
| Indian | Not outsourced | 170 | 225 | 75.56 |
| Indian | Outsourced | 55 | 225 | 24.44 |
| Pakistani | Not outsourced | 93 | 142 | 65.49 |
| Pakistani | Outsourced | 49 | 142 | 34.51 |
| Bangladeshi | Not outsourced | 47 | 68 | 69.12 |
| Bangladeshi | Outsourced | 21 | 68 | 30.88 |
| Chinese | Not outsourced | 70 | 82 | 85.37 |
| Chinese | Outsourced | 12 | 82 | 14.63 |
| Any other Asian background | Not outsourced | 57 | 83 | 68.67 |
| Any other Asian background | Outsourced | 26 | 83 | 31.33 |
| African | Not outsourced | 232 | 343 | 67.64 |
| African | Outsourced | 111 | 343 | 32.36 |
| Caribbean | Not outsourced | 74 | 89 | 83.15 |
| Caribbean | Outsourced | 15 | 89 | 16.85 |
| Any other Black, Black British, or Caribbean background | Not outsourced | 36 | 52 | 69.23 |
| Any other Black, Black British, or Caribbean background | Outsourced | 16 | 52 | 30.77 |
| Arab | Not outsourced | 12 | 19 | 63.16 |
| Arab | Outsourced | 7 | 19 | 36.84 |
| Any other ethnic group | Not outsourced | 13 | 16 | 81.25 |
| Any other ethnic group | Outsourced | 3 | 16 | 18.75 |
| Don’t think of myself as any of these | Not outsourced | 7 | 12 | 58.33 |
| Don’t think of myself as any of these | Outsourced | 5 | 12 | 41.67 |
| Prefer not to say | Not outsourced | 22 | 26 | 84.62 |
| Prefer not to say | Outsourced | 4 | 26 | 15.38 |
| NA | Not outsourced | 282 | 343 | 82.22 |
| NA | Outsourced | 61 | 343 | 17.78 |
# ordering_df <- ethnicity_statistics %>%
# filter(`Outsourcing group` == "Outsourced") %>%
# mutate(
# Ethnicity = haven::as_factor(Ethnicity),
# Ethnicity = factor(Ethnicity),
# Ethnicity = forcats::fct_reorder(Ethnicity, Percentage)
# )
ethnicity_statistics %>%
mutate(
Ethnicity = haven::as_factor(Ethnicity)
) %>%
ggplot(., aes(Ethnicity, Percentage, fill = `Outsourcing group`)) +
geom_col(colour="black") +
annotate("text", x = ethnicity_statistics$Ethnicity, y = 75, label = paste0("n=", ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values=many_colours, name = "Ethnicity")write_csv(ethnicity_statistics, file="../outputs/data/ethnicity_stats_2.csv")ethnicity_statistics <- data %>%
group_by(outsourcing_group, Ethnicity_labelled) %>%
summarise(
n = n(), # count cases
Frequency = sum(NatRepemployees) # count weighted cases
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum),
Ethnicity_short = Ethnicity_labelled
) %>%
separate_wider_delim(Ethnicity_short,
names = c("Ethnicity_short", "Ethnicity detail"),
delim = stringr::regex(" / |, "), # use multiple delims
too_few = "align_start",
too_many = "merge")
readr::write_csv(ethnicity_statistics, file = "../outputs/data/ethnicity_stats_outsourcing_groups_1.csv")# test <- multinom(outsourcing_group ~ Ethnicity_collapsed, data, weights = NatRepemployees)
# summary(test)
#
# z <- summary(test)$coefficients/summary(test)$standard.errors
# z
#
# p <- (1 - pnorm(abs(z), 0, 1)) * 2
# p
#
# # Assuming your dataframe is named 'p'
# p_2 <- apply(p, 2, function(x) ifelse(x < 0.01, 1, NA))
#
# sig_ors <- exp(summary(test)$coefficients * p_2)
# we can take the results of this forward and plot the orsethnicity_statistics %>%
# mutate(
# Ethnicity = haven::as_factor(Ethnicity)
# ) %>%
knitr::kable(.,digits = 2) %>%
kable_styling(full_width = F)| outsourcing_group | Ethnicity_labelled | n | Frequency | N | Sum | Percentage | Ethnicity_short | Ethnicity detail |
|---|---|---|---|---|---|---|---|---|
| Not outsourced | English / Welsh / Scottish / Northern Irish / British | 6623 | 6589.16 | 8472 | 8446.64 | 78.01 | English | Welsh / Scottish / Northern Irish / British |
| Not outsourced | Irish | 111 | 98.75 | 8472 | 8446.64 | 1.17 | Irish | NA |
| Not outsourced | Gypsy or Irish Traveller | 6 | 8.32 | 8472 | 8446.64 | 0.10 | Gypsy or Irish Traveller | NA |
| Not outsourced | Roma | 4 | 5.24 | 8472 | 8446.64 | 0.06 | Roma | NA |
| Not outsourced | Any other White background | 369 | 385.42 | 8472 | 8446.64 | 4.56 | Any other White background | NA |
| Not outsourced | White and Black Caribbean | 84 | 53.22 | 8472 | 8446.64 | 0.63 | White and Black Caribbean | NA |
| Not outsourced | White and Black African | 40 | 20.43 | 8472 | 8446.64 | 0.24 | White and Black African | NA |
| Not outsourced | White and Asian | 61 | 32.39 | 8472 | 8446.64 | 0.38 | White and Asian | NA |
| Not outsourced | Any other Mixed / Multiple ethnic background | 59 | 33.60 | 8472 | 8446.64 | 0.40 | Any other Mixed | Multiple ethnic background |
| Not outsourced | Indian | 170 | 237.76 | 8472 | 8446.64 | 2.81 | Indian | NA |
| Not outsourced | Pakistani | 93 | 96.09 | 8472 | 8446.64 | 1.14 | Pakistani | NA |
| Not outsourced | Bangladeshi | 47 | 53.46 | 8472 | 8446.64 | 0.63 | Bangladeshi | NA |
| Not outsourced | Chinese | 70 | 124.50 | 8472 | 8446.64 | 1.47 | Chinese | NA |
| Not outsourced | Any other Asian background | 57 | 118.35 | 8472 | 8446.64 | 1.40 | Any other Asian background | NA |
| Not outsourced | African | 232 | 157.09 | 8472 | 8446.64 | 1.86 | African | NA |
| Not outsourced | Caribbean | 74 | 56.56 | 8472 | 8446.64 | 0.67 | Caribbean | NA |
| Not outsourced | Any other Black, Black British, or Caribbean background | 36 | 25.60 | 8472 | 8446.64 | 0.30 | Any other Black | Black British, or Caribbean background |
| Not outsourced | Arab | 12 | 20.47 | 8472 | 8446.64 | 0.24 | Arab | NA |
| Not outsourced | Any other ethnic group | 13 | 23.40 | 8472 | 8446.64 | 0.28 | Any other ethnic group | NA |
| Not outsourced | Don’t think of myself as any of these | 7 | 5.87 | 8472 | 8446.64 | 0.07 | Don’t think of myself as any of these | NA |
| Not outsourced | Prefer not to say | 22 | 23.54 | 8472 | 8446.64 | 0.28 | Prefer not to say | NA |
| Not outsourced | NA | 282 | 277.44 | 8472 | 8446.64 | 3.28 | NA | NA |
| Outsourced | English / Welsh / Scottish / Northern Irish / British | 742 | 778.19 | 1123 | 1161.08 | 67.02 | English | Welsh / Scottish / Northern Irish / British |
| Outsourced | Irish | 12 | 11.23 | 1123 | 1161.08 | 0.97 | Irish | NA |
| Outsourced | Gypsy or Irish Traveller | 2 | 2.48 | 1123 | 1161.08 | 0.21 | Gypsy or Irish Traveller | NA |
| Outsourced | Roma | 2 | 1.48 | 1123 | 1161.08 | 0.13 | Roma | NA |
| Outsourced | Any other White background | 63 | 72.25 | 1123 | 1161.08 | 6.22 | Any other White background | NA |
| Outsourced | White and Black Caribbean | 8 | 3.87 | 1123 | 1161.08 | 0.33 | White and Black Caribbean | NA |
| Outsourced | White and Black African | 21 | 11.08 | 1123 | 1161.08 | 0.95 | White and Black African | NA |
| Outsourced | White and Asian | 9 | 5.80 | 1123 | 1161.08 | 0.50 | White and Asian | NA |
| Outsourced | Any other Mixed / Multiple ethnic background | 15 | 9.84 | 1123 | 1161.08 | 0.85 | Any other Mixed | Multiple ethnic background |
| Outsourced | Indian | 32 | 43.96 | 1123 | 1161.08 | 3.79 | Indian | NA |
| Outsourced | Pakistani | 29 | 32.69 | 1123 | 1161.08 | 2.82 | Pakistani | NA |
| Outsourced | Bangladeshi | 15 | 17.95 | 1123 | 1161.08 | 1.55 | Bangladeshi | NA |
| Outsourced | Chinese | 7 | 12.75 | 1123 | 1161.08 | 1.10 | Chinese | NA |
| Outsourced | Any other Asian background | 17 | 30.35 | 1123 | 1161.08 | 2.61 | Any other Asian background | NA |
| Outsourced | African | 74 | 47.20 | 1123 | 1161.08 | 4.07 | African | NA |
| Outsourced | Caribbean | 10 | 10.40 | 1123 | 1161.08 | 0.90 | Caribbean | NA |
| Outsourced | Any other Black, Black British, or Caribbean background | 13 | 9.46 | 1123 | 1161.08 | 0.81 | Any other Black | Black British, or Caribbean background |
| Outsourced | Arab | 3 | 4.97 | 1123 | 1161.08 | 0.43 | Arab | NA |
| Outsourced | Any other ethnic group | 1 | 1.52 | 1123 | 1161.08 | 0.13 | Any other ethnic group | NA |
| Outsourced | Don’t think of myself as any of these | 4 | 2.54 | 1123 | 1161.08 | 0.22 | Don’t think of myself as any of these | NA |
| Outsourced | Prefer not to say | 1 | 1.67 | 1123 | 1161.08 | 0.14 | Prefer not to say | NA |
| Outsourced | NA | 43 | 49.38 | 1123 | 1161.08 | 4.25 | NA | NA |
| Likely agency | English / Welsh / Scottish / Northern Irish / British | 180 | 174.33 | 269 | 266.54 | 65.41 | English | Welsh / Scottish / Northern Irish / British |
| Likely agency | Irish | 1 | 0.66 | 269 | 266.54 | 0.25 | Irish | NA |
| Likely agency | Roma | 1 | 0.77 | 269 | 266.54 | 0.29 | Roma | NA |
| Likely agency | Any other White background | 10 | 13.33 | 269 | 266.54 | 5.00 | Any other White background | NA |
| Likely agency | White and Black Caribbean | 1 | 0.69 | 269 | 266.54 | 0.26 | White and Black Caribbean | NA |
| Likely agency | White and Black African | 2 | 0.91 | 269 | 266.54 | 0.34 | White and Black African | NA |
| Likely agency | White and Asian | 2 | 2.54 | 269 | 266.54 | 0.95 | White and Asian | NA |
| Likely agency | Any other Mixed / Multiple ethnic background | 5 | 4.33 | 269 | 266.54 | 1.63 | Any other Mixed | Multiple ethnic background |
| Likely agency | Indian | 8 | 11.83 | 269 | 266.54 | 4.44 | Indian | NA |
| Likely agency | Pakistani | 8 | 9.74 | 269 | 266.54 | 3.65 | Pakistani | NA |
| Likely agency | Bangladeshi | 3 | 2.61 | 269 | 266.54 | 0.98 | Bangladeshi | NA |
| Likely agency | Chinese | 1 | 1.58 | 269 | 266.54 | 0.59 | Chinese | NA |
| Likely agency | Any other Asian background | 5 | 8.34 | 269 | 266.54 | 3.13 | Any other Asian background | NA |
| Likely agency | African | 22 | 12.82 | 269 | 266.54 | 4.81 | African | NA |
| Likely agency | Caribbean | 3 | 3.39 | 269 | 266.54 | 1.27 | Caribbean | NA |
| Likely agency | Any other Black, Black British, or Caribbean background | 1 | 1.16 | 269 | 266.54 | 0.44 | Any other Black | Black British, or Caribbean background |
| Likely agency | Arab | 2 | 3.42 | 269 | 266.54 | 1.28 | Arab | NA |
| Likely agency | Any other ethnic group | 1 | 3.93 | 269 | 266.54 | 1.48 | Any other ethnic group | NA |
| Likely agency | Don’t think of myself as any of these | 1 | 0.40 | 269 | 266.54 | 0.15 | Don’t think of myself as any of these | NA |
| Likely agency | Prefer not to say | 1 | 0.52 | 269 | 266.54 | 0.20 | Prefer not to say | NA |
| Likely agency | NA | 11 | 9.20 | 269 | 266.54 | 3.45 | NA | NA |
| High indicators | English / Welsh / Scottish / Northern Irish / British | 202 | 190.56 | 291 | 280.74 | 67.88 | English | Welsh / Scottish / Northern Irish / British |
| High indicators | Irish | 4 | 2.97 | 291 | 280.74 | 1.06 | Irish | NA |
| High indicators | Any other White background | 7 | 8.37 | 291 | 280.74 | 2.98 | Any other White background | NA |
| High indicators | White and Black Caribbean | 2 | 0.97 | 291 | 280.74 | 0.35 | White and Black Caribbean | NA |
| High indicators | White and Black African | 3 | 2.62 | 291 | 280.74 | 0.93 | White and Black African | NA |
| High indicators | White and Asian | 1 | 0.78 | 291 | 280.74 | 0.28 | White and Asian | NA |
| High indicators | Any other Mixed / Multiple ethnic background | 3 | 1.71 | 291 | 280.74 | 0.61 | Any other Mixed | Multiple ethnic background |
| High indicators | Indian | 15 | 18.18 | 291 | 280.74 | 6.48 | Indian | NA |
| High indicators | Pakistani | 12 | 11.43 | 291 | 280.74 | 4.07 | Pakistani | NA |
| High indicators | Bangladeshi | 3 | 2.48 | 291 | 280.74 | 0.88 | Bangladeshi | NA |
| High indicators | Chinese | 4 | 6.70 | 291 | 280.74 | 2.39 | Chinese | NA |
| High indicators | Any other Asian background | 4 | 6.10 | 291 | 280.74 | 2.17 | Any other Asian background | NA |
| High indicators | African | 15 | 9.93 | 291 | 280.74 | 3.54 | African | NA |
| High indicators | Caribbean | 2 | 1.31 | 291 | 280.74 | 0.47 | Caribbean | NA |
| High indicators | Any other Black, Black British, or Caribbean background | 2 | 1.16 | 291 | 280.74 | 0.41 | Any other Black | Black British, or Caribbean background |
| High indicators | Arab | 2 | 3.63 | 291 | 280.74 | 1.29 | Arab | NA |
| High indicators | Any other ethnic group | 1 | 1.60 | 291 | 280.74 | 0.57 | Any other ethnic group | NA |
| High indicators | Prefer not to say | 2 | 4.72 | 291 | 280.74 | 1.68 | Prefer not to say | NA |
| High indicators | NA | 7 | 5.50 | 291 | 280.74 | 1.96 | NA | NA |
data %>%
# get values of labels
# mutate_all(haven::as_factor) %>%
group_by(outsourcing_group, Ethnicity) %>%
summarise(
Frequency = sum(NatRepemployees),
n = n()
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum),
N = n()
) %>%
ggplot(., aes(outsourcing_group, Percentage, fill = as.factor(Ethnicity))) +
geom_col(colour="black") +
annotate("text", x = ethnicity_statistics$outsourcing_group, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal()ethnicity_key <- data.frame("number" = seq(1,22,1),
"ethnicity" = c(levels(ethnicity_statistics$Ethnicity_labelled), NA))
ethnicity_key %>%
kable() %>%
kable_styling(full_width = F)| number | ethnicity |
|---|---|
| 1 | English / Welsh / Scottish / Northern Irish / British |
| 2 | Irish |
| 3 | Gypsy or Irish Traveller |
| 4 | Roma |
| 5 | Any other White background |
| 6 | White and Black Caribbean |
| 7 | White and Black African |
| 8 | White and Asian |
| 9 | Any other Mixed / Multiple ethnic background |
| 10 | Indian |
| 11 | Pakistani |
| 12 | Bangladeshi |
| 13 | Chinese |
| 14 | Any other Asian background |
| 15 | African |
| 16 | Caribbean |
| 17 | Any other Black, Black British, or Caribbean background |
| 18 | Arab |
| 19 | Any other ethnic group |
| 20 | Don’t think of myself as any of these |
| 21 | Prefer not to say |
| 22 | NA |
ethnicity_statistics_2 <- data %>%
group_by(outsourcing_group, Ethnicity_collapsed) %>%
summarise(
n = n(), # count cases
Frequency = sum(NatRepemployees) # count weighted cases
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
) #%>%
# separate_wider_delim(Ethnicity_short,
# names = c("Ethnicity_short", "Ethnicity detail"),
# delim = stringr::regex(" / |, "), # use multiple delims
# too_few = "align_start",
# too_many = "merge")
#readr::write_csv(ethnicity_statistics, file = "../outputs/data/ethnicity_stats_2.csv")ethnicity_statistics_2 %>%
# mutate(
# Ethnicity = haven::as_factor(Ethnicity)
# ) %>%
knitr::kable(.,digits = 2) %>%
kable_styling(full_width = F)| outsourcing_group | Ethnicity_collapsed | n | Frequency | N | Sum | Percentage |
|---|---|---|---|---|---|---|
| Not outsourced | White | 6734 | 6687.91 | 8472 | 8446.64 | 79.18 |
| Not outsourced | White other | 379 | 398.98 | 8472 | 8446.64 | 4.72 |
| Not outsourced | Black Caribbean | 158 | 109.78 | 8472 | 8446.64 | 1.30 |
| Not outsourced | Black African | 272 | 177.52 | 8472 | 8446.64 | 2.10 |
| Not outsourced | Mixed other | 120 | 65.99 | 8472 | 8446.64 | 0.78 |
| Not outsourced | South Asian | 310 | 387.30 | 8472 | 8446.64 | 4.59 |
| Not outsourced | East Asian | 70 | 124.50 | 8472 | 8446.64 | 1.47 |
| Not outsourced | Other | 381 | 448.60 | 8472 | 8446.64 | 5.31 |
| Not outsourced | Black other | 36 | 25.60 | 8472 | 8446.64 | 0.30 |
| Not outsourced | Arab | 12 | 20.47 | 8472 | 8446.64 | 0.24 |
| Outsourced | White | 754 | 789.42 | 1123 | 1161.08 | 67.99 |
| Outsourced | White other | 67 | 76.21 | 1123 | 1161.08 | 6.56 |
| Outsourced | Black Caribbean | 18 | 14.27 | 1123 | 1161.08 | 1.23 |
| Outsourced | Black African | 95 | 58.28 | 1123 | 1161.08 | 5.02 |
| Outsourced | Mixed other | 24 | 15.64 | 1123 | 1161.08 | 1.35 |
| Outsourced | South Asian | 76 | 94.60 | 1123 | 1161.08 | 8.15 |
| Outsourced | East Asian | 7 | 12.75 | 1123 | 1161.08 | 1.10 |
| Outsourced | Other | 66 | 85.46 | 1123 | 1161.08 | 7.36 |
| Outsourced | Black other | 13 | 9.46 | 1123 | 1161.08 | 0.81 |
| Outsourced | Arab | 3 | 4.97 | 1123 | 1161.08 | 0.43 |
| Likely agency | White | 181 | 174.99 | 269 | 266.54 | 65.65 |
| Likely agency | White other | 11 | 14.10 | 269 | 266.54 | 5.29 |
| Likely agency | Black Caribbean | 4 | 4.09 | 269 | 266.54 | 1.53 |
| Likely agency | Black African | 24 | 13.74 | 269 | 266.54 | 5.15 |
| Likely agency | Mixed other | 7 | 6.88 | 269 | 266.54 | 2.58 |
| Likely agency | South Asian | 19 | 24.18 | 269 | 266.54 | 9.07 |
| Likely agency | East Asian | 1 | 1.58 | 269 | 266.54 | 0.59 |
| Likely agency | Other | 19 | 22.39 | 269 | 266.54 | 8.40 |
| Likely agency | Black other | 1 | 1.16 | 269 | 266.54 | 0.44 |
| Likely agency | Arab | 2 | 3.42 | 269 | 266.54 | 1.28 |
| High indicators | White | 206 | 193.53 | 291 | 280.74 | 68.93 |
| High indicators | White other | 7 | 8.37 | 291 | 280.74 | 2.98 |
| High indicators | Black Caribbean | 4 | 2.28 | 291 | 280.74 | 0.81 |
| High indicators | Black African | 18 | 12.56 | 291 | 280.74 | 4.47 |
| High indicators | Mixed other | 4 | 2.50 | 291 | 280.74 | 0.89 |
| High indicators | South Asian | 30 | 32.08 | 291 | 280.74 | 11.43 |
| High indicators | East Asian | 4 | 6.70 | 291 | 280.74 | 2.39 |
| High indicators | Other | 14 | 17.93 | 291 | 280.74 | 6.39 |
| High indicators | Black other | 2 | 1.16 | 291 | 280.74 | 0.41 |
| High indicators | Arab | 2 | 3.63 | 291 | 280.74 | 1.29 |
data %>%
# get values of labels
# mutate_all(haven::as_factor) %>%
group_by(outsourcing_group, Ethnicity_collapsed) %>%
summarise(
Frequency = sum(NatRepemployees),
n = n()
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum),
N = n()
) %>%
ggplot(., aes(outsourcing_group, Percentage, fill = Ethnicity_collapsed)) +
geom_col(colour="black") +
annotate("text", x = ethnicity_statistics$outsourcing_group, y = 75, label = paste0("n=",ethnicity_statistics$N)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal()# mod <- glm(outsourcing_status ~ Ethnicity_collapsed, data, family="binomial", weights=NatRepemployees)
# summary(mod)
#
# coef_table <- data.frame("estimate" = mod[["coefficients"]]) %>%
# mutate(
# or = round(exp(estimate),2)
# )ethnicity_summary_paysplit <- data %>%
group_by(outsourcing_status, income_group, Ethnicity_labelled) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees)
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum),
Ethnicity_short = Ethnicity_labelled
) %>%
separate_wider_delim(Ethnicity_short,
names = c("Ethnicity_short", "Ethnicity detail"),
delim = stringr::regex(" / |, "), # use multiple delims
too_few = "align_start",
too_many = "merge")
readr::write_csv(ethnicity_statistics, file = "../outputs/data/ethnicity_stats_paysplit_1.csv")temp_data <- ethnicity_summary_paysplit %>%
drop_na(income_group)
for(group in unique(temp_data$income_group)){
plot_data <- temp_data %>%
filter(income_group==group)
plot <- plot_data %>%
ggplot(., aes(Ethnicity_short, Percentage, fill = outsourcing_status)) +
geom_col(colour="black", position = position_dodge()) +
#annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal() +
theme(
legend.position = "bottom"
) +
ggtitle(paste0(group, " income"))
print(plot)
}# ethnicity_summary_paysplit %>%
# drop_na(income_group) %>%
# ggplot(., aes(outsourcing_status, Percentage, fill = Ethnicity_short)) +
# facet_grid(rows=~income_group) +
# geom_col(colour="black") +
# #annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
# coord_flip() +
# scale_fill_manual(values = many_colours, name = "Ethnicity") +
# xlab("Outsourcing group") +
# theme_minimal() +
# theme(
# legend.position = "bottom"
# )ethnicity_summary_paysplit <- data %>%
group_by(outsourcing_status, income_group, Ethnicity_collapsed) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees)
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
)
#readr::write_csv(ethnicity_statistics, file = "../outputs/data/ethnicity_stats_paysplit_1.csv")temp_data <- ethnicity_summary_paysplit %>%
drop_na(income_group)
for(group in unique(temp_data$income_group)){
plot_data <- temp_data %>%
filter(income_group==group)
plot <- plot_data %>%
ggplot(., aes(Ethnicity_collapsed, Percentage, fill = outsourcing_status)) +
geom_col(colour="black", position = position_dodge()) +
#annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal() +
theme(
legend.position = "bottom"
) +
ggtitle(paste0(group, " income"))
print(plot)
}# ethnicity_summary_paysplit %>%
# drop_na(income_group) %>%
# ggplot(., aes(outsourcing_status, Percentage, fill = Ethnicity_short)) +
# facet_grid(rows=~income_group) +
# geom_col(colour="black") +
# #annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
# coord_flip() +
# scale_fill_manual(values = many_colours, name = "Ethnicity") +
# xlab("Outsourcing group") +
# theme_minimal() +
# theme(
# legend.position = "bottom"
# )mod <- glm(outsourcing_status ~ income_group*Ethnicity_collapsed, data, family="binomial")
summary(mod)
Call:
glm(formula = outsourcing_status ~ income_group * Ethnicity_collapsed,
family = "binomial", data = data)
Coefficients:
Estimate Std. Error z value
(Intercept) -1.83224 0.04438 -41.286
income_groupLow 0.17246 0.06883 2.506
Ethnicity_collapsedWhite other 0.33035 0.16107 2.051
Ethnicity_collapsedBlack Caribbean -0.19371 0.31031 -0.624
Ethnicity_collapsedBlack African 1.23328 0.13921 8.859
Ethnicity_collapsedMixed other 0.23623 0.28662 0.824
Ethnicity_collapsedSouth Asian 0.92653 0.14299 6.480
Ethnicity_collapsedEast Asian 0.17932 0.36655 0.489
Ethnicity_collapsedOther 0.42737 0.18195 2.349
Ethnicity_collapsedBlack other 1.19039 0.39319 3.027
Ethnicity_collapsedArab 1.32142 0.51830 2.550
income_groupLow:Ethnicity_collapsedWhite other -0.13691 0.27257 -0.502
income_groupLow:Ethnicity_collapsedBlack Caribbean 0.55421 0.45290 1.224
income_groupLow:Ethnicity_collapsedBlack African -0.44101 0.23655 -1.864
income_groupLow:Ethnicity_collapsedMixed other 0.64585 0.41324 1.563
income_groupLow:Ethnicity_collapsedSouth Asian -0.01517 0.23811 -0.064
income_groupLow:Ethnicity_collapsedEast Asian -0.12898 0.73289 -0.176
income_groupLow:Ethnicity_collapsedOther 0.33341 0.27581 1.209
income_groupLow:Ethnicity_collapsedBlack other -0.70927 0.69591 -1.019
income_groupLow:Ethnicity_collapsedArab -0.35479 1.33094 -0.267
Pr(>|z|)
(Intercept) < 2e-16 ***
income_groupLow 0.01222 *
Ethnicity_collapsedWhite other 0.04027 *
Ethnicity_collapsedBlack Caribbean 0.53246
Ethnicity_collapsedBlack African < 2e-16 ***
Ethnicity_collapsedMixed other 0.40983
Ethnicity_collapsedSouth Asian 9.19e-11 ***
Ethnicity_collapsedEast Asian 0.62469
Ethnicity_collapsedOther 0.01883 *
Ethnicity_collapsedBlack other 0.00247 **
Ethnicity_collapsedArab 0.01079 *
income_groupLow:Ethnicity_collapsedWhite other 0.61548
income_groupLow:Ethnicity_collapsedBlack Caribbean 0.22107
income_groupLow:Ethnicity_collapsedBlack African 0.06228 .
income_groupLow:Ethnicity_collapsedMixed other 0.11808
income_groupLow:Ethnicity_collapsedSouth Asian 0.94921
income_groupLow:Ethnicity_collapsedEast Asian 0.86030
income_groupLow:Ethnicity_collapsedOther 0.22673
income_groupLow:Ethnicity_collapsedBlack other 0.30811
income_groupLow:Ethnicity_collapsedArab 0.78980
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8127.6 on 8942 degrees of freedom
Residual deviance: 7956.9 on 8923 degrees of freedom
(1212 observations deleted due to missingness)
AIC: 7996.9
Number of Fisher Scoring iterations: 4
test_data <- data %>%
drop_na(income_group) %>%
mutate(
income_group = factor(income_group, levels = c("Not low", "Low")),
Ethnicity_short = Ethnicity_labelled
) %>%
separate_wider_delim(Ethnicity_short,
names = c("Ethnicity_short", "Ethnicity detail"),
delim = " / ",
too_few = "align_start",
too_many = "merge") %>%
mutate(
Ethnicity_short = forcats::fct_relevel(factor(Ethnicity_short), "English")
)
test <- glm(outsourcing_status ~ Age + Gender + income_group + Ethnicity_short, family = "binomial", data = test_data)
summary(test)
Call:
glm(formula = outsourcing_status ~ Age + Gender + income_group +
Ethnicity_short, family = "binomial", data = test_data)
Coefficients:
Estimate
(Intercept) -1.122623
Age -0.024661
GenderMale 0.531731
GenderOther 0.128987
GenderPrefer not to say -0.240452
income_groupLow 0.276827
Ethnicity_shortAfrican 0.750804
Ethnicity_shortAny other Asian background 0.691922
Ethnicity_shortAny other Black, Black British, or Caribbean background 0.752002
Ethnicity_shortAny other ethnic group 0.220800
Ethnicity_shortAny other Mixed 0.673451
Ethnicity_shortAny other White background 0.171750
Ethnicity_shortArab 0.989803
Ethnicity_shortBangladeshi 0.703248
Ethnicity_shortCaribbean 0.183752
Ethnicity_shortChinese 0.021374
Ethnicity_shortDon’t think of myself as any of these 1.449163
Ethnicity_shortGypsy or Irish Traveller 0.282974
Ethnicity_shortIndian 0.548848
Ethnicity_shortIrish -0.126226
Ethnicity_shortPakistani 0.864916
Ethnicity_shortPrefer not to say -0.006321
Ethnicity_shortRoma 1.082635
Ethnicity_shortWhite and Asian -0.090237
Ethnicity_shortWhite and Black African 1.155726
Ethnicity_shortWhite and Black Caribbean -0.279236
Std. Error
(Intercept) 0.111181
Age 0.002392
GenderMale 0.061348
GenderOther 0.780590
GenderPrefer not to say 1.056849
income_groupLow 0.061893
Ethnicity_shortAfrican 0.125846
Ethnicity_shortAny other Asian background 0.258347
Ethnicity_shortAny other Black, Black British, or Caribbean background 0.330047
Ethnicity_shortAny other ethnic group 0.647274
Ethnicity_shortAny other Mixed 0.264347
Ethnicity_shortAny other White background 0.135112
Ethnicity_shortArab 0.483591
Ethnicity_shortBangladeshi 0.286337
Ethnicity_shortCaribbean 0.300048
Ethnicity_shortChinese 0.319975
Ethnicity_shortDon’t think of myself as any of these 0.659571
Ethnicity_shortGypsy or Irish Traveller 0.843433
Ethnicity_shortIndian 0.163023
Ethnicity_shortIrish 0.292763
Ethnicity_shortPakistani 0.188944
Ethnicity_shortPrefer not to say 0.633334
Ethnicity_shortRoma 0.769140
Ethnicity_shortWhite and Asian 0.335327
Ethnicity_shortWhite and Black African 0.262938
Ethnicity_shortWhite and Black Caribbean 0.345023
z value
(Intercept) -10.097
Age -10.311
GenderMale 8.667
GenderOther 0.165
GenderPrefer not to say -0.228
income_groupLow 4.473
Ethnicity_shortAfrican 5.966
Ethnicity_shortAny other Asian background 2.678
Ethnicity_shortAny other Black, Black British, or Caribbean background 2.278
Ethnicity_shortAny other ethnic group 0.341
Ethnicity_shortAny other Mixed 2.548
Ethnicity_shortAny other White background 1.271
Ethnicity_shortArab 2.047
Ethnicity_shortBangladeshi 2.456
Ethnicity_shortCaribbean 0.612
Ethnicity_shortChinese 0.067
Ethnicity_shortDon’t think of myself as any of these 2.197
Ethnicity_shortGypsy or Irish Traveller 0.336
Ethnicity_shortIndian 3.367
Ethnicity_shortIrish -0.431
Ethnicity_shortPakistani 4.578
Ethnicity_shortPrefer not to say -0.010
Ethnicity_shortRoma 1.408
Ethnicity_shortWhite and Asian -0.269
Ethnicity_shortWhite and Black African 4.395
Ethnicity_shortWhite and Black Caribbean -0.809
Pr(>|z|)
(Intercept) < 2e-16
Age < 2e-16
GenderMale < 2e-16
GenderOther 0.868753
GenderPrefer not to say 0.820021
income_groupLow 7.73e-06
Ethnicity_shortAfrican 2.43e-09
Ethnicity_shortAny other Asian background 0.007400
Ethnicity_shortAny other Black, Black British, or Caribbean background 0.022698
Ethnicity_shortAny other ethnic group 0.733011
Ethnicity_shortAny other Mixed 0.010847
Ethnicity_shortAny other White background 0.203669
Ethnicity_shortArab 0.040680
Ethnicity_shortBangladeshi 0.014049
Ethnicity_shortCaribbean 0.540267
Ethnicity_shortChinese 0.946741
Ethnicity_shortDon’t think of myself as any of these 0.028011
Ethnicity_shortGypsy or Irish Traveller 0.737246
Ethnicity_shortIndian 0.000761
Ethnicity_shortIrish 0.666357
Ethnicity_shortPakistani 4.70e-06
Ethnicity_shortPrefer not to say 0.992037
Ethnicity_shortRoma 0.159252
Ethnicity_shortWhite and Asian 0.787852
Ethnicity_shortWhite and Black African 1.11e-05
Ethnicity_shortWhite and Black Caribbean 0.418329
(Intercept) ***
Age ***
GenderMale ***
GenderOther
GenderPrefer not to say
income_groupLow ***
Ethnicity_shortAfrican ***
Ethnicity_shortAny other Asian background **
Ethnicity_shortAny other Black, Black British, or Caribbean background *
Ethnicity_shortAny other ethnic group
Ethnicity_shortAny other Mixed *
Ethnicity_shortAny other White background
Ethnicity_shortArab *
Ethnicity_shortBangladeshi *
Ethnicity_shortCaribbean
Ethnicity_shortChinese
Ethnicity_shortDon’t think of myself as any of these *
Ethnicity_shortGypsy or Irish Traveller
Ethnicity_shortIndian ***
Ethnicity_shortIrish
Ethnicity_shortPakistani ***
Ethnicity_shortPrefer not to say
Ethnicity_shortRoma
Ethnicity_shortWhite and Asian
Ethnicity_shortWhite and Black African ***
Ethnicity_shortWhite and Black Caribbean
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 7918.6 on 8739 degrees of freedom
Residual deviance: 7567.2 on 8714 degrees of freedom
(203 observations deleted due to missingness)
AIC: 7619.2
Number of Fisher Scoring iterations: 4
Another way of looking at this is to calculate, for each ethnicity, the proportion of workers in each outsourcing group. Doing so yields the plot below.16
ethnicity_statistics <- data %>%
# get values of labels
# mutate_all(haven::as_factor) %>%
group_by(Ethnicity, outsourcing_status) %>%
summarise(
Frequency = n()
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
) %>%
rename(
`Outsourcing group` = outsourcing_status
)
ethnicity_statistics %>%
mutate(
Ethnicity = haven::as_factor(Ethnicity)
) %>%
knitr::kable(.,digits = 2) %>%
kable_styling(full_width = F)| Ethnicity | Outsourcing group | Frequency | Sum | Percentage |
|---|---|---|---|---|
| English / Welsh / Scottish / Northern Irish / British | Not outsourced | 6623 | 7747 | 85.49 |
| English / Welsh / Scottish / Northern Irish / British | Outsourced | 1124 | 7747 | 14.51 |
| Irish | Not outsourced | 111 | 128 | 86.72 |
| Irish | Outsourced | 17 | 128 | 13.28 |
| Gypsy or Irish Traveller | Not outsourced | 6 | 8 | 75.00 |
| Gypsy or Irish Traveller | Outsourced | 2 | 8 | 25.00 |
| Roma | Not outsourced | 4 | 7 | 57.14 |
| Roma | Outsourced | 3 | 7 | 42.86 |
| Any other White background | Not outsourced | 369 | 449 | 82.18 |
| Any other White background | Outsourced | 80 | 449 | 17.82 |
| White and Black Caribbean | Not outsourced | 84 | 95 | 88.42 |
| White and Black Caribbean | Outsourced | 11 | 95 | 11.58 |
| White and Black African | Not outsourced | 40 | 66 | 60.61 |
| White and Black African | Outsourced | 26 | 66 | 39.39 |
| White and Asian | Not outsourced | 61 | 73 | 83.56 |
| White and Asian | Outsourced | 12 | 73 | 16.44 |
| Any other Mixed / Multiple ethnic background | Not outsourced | 59 | 82 | 71.95 |
| Any other Mixed / Multiple ethnic background | Outsourced | 23 | 82 | 28.05 |
| Indian | Not outsourced | 170 | 225 | 75.56 |
| Indian | Outsourced | 55 | 225 | 24.44 |
| Pakistani | Not outsourced | 93 | 142 | 65.49 |
| Pakistani | Outsourced | 49 | 142 | 34.51 |
| Bangladeshi | Not outsourced | 47 | 68 | 69.12 |
| Bangladeshi | Outsourced | 21 | 68 | 30.88 |
| Chinese | Not outsourced | 70 | 82 | 85.37 |
| Chinese | Outsourced | 12 | 82 | 14.63 |
| Any other Asian background | Not outsourced | 57 | 83 | 68.67 |
| Any other Asian background | Outsourced | 26 | 83 | 31.33 |
| African | Not outsourced | 232 | 343 | 67.64 |
| African | Outsourced | 111 | 343 | 32.36 |
| Caribbean | Not outsourced | 74 | 89 | 83.15 |
| Caribbean | Outsourced | 15 | 89 | 16.85 |
| Any other Black, Black British, or Caribbean background | Not outsourced | 36 | 52 | 69.23 |
| Any other Black, Black British, or Caribbean background | Outsourced | 16 | 52 | 30.77 |
| Arab | Not outsourced | 12 | 19 | 63.16 |
| Arab | Outsourced | 7 | 19 | 36.84 |
| Any other ethnic group | Not outsourced | 13 | 16 | 81.25 |
| Any other ethnic group | Outsourced | 3 | 16 | 18.75 |
| Don’t think of myself as any of these | Not outsourced | 7 | 12 | 58.33 |
| Don’t think of myself as any of these | Outsourced | 5 | 12 | 41.67 |
| Prefer not to say | Not outsourced | 22 | 26 | 84.62 |
| Prefer not to say | Outsourced | 4 | 26 | 15.38 |
| NA | Not outsourced | 282 | 343 | 82.22 |
| NA | Outsourced | 61 | 343 | 17.78 |
# ordering_df <- ethnicity_statistics %>%
# filter(`Outsourcing group` == "Outsourced") %>%
# mutate(
# Ethnicity = haven::as_factor(Ethnicity),
# Ethnicity = factor(Ethnicity),
# Ethnicity = forcats::fct_reorder(Ethnicity, Percentage)
# )
ethnicity_statistics %>%
mutate(
Ethnicity = haven::as_factor(Ethnicity)
) %>%
ggplot(., aes(Ethnicity, Percentage, fill = `Outsourcing group`)) +
geom_col(colour="black") +
annotate("text", x = ethnicity_statistics$Ethnicity, y = 75, label = paste0("n=", ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values=many_colours, name = "Ethnicity")write_csv(ethnicity_statistics, file="../outputs/data/ethnicity_stats_2.csv")bornuk_statistics <- data %>%
# get values of labels
mutate_all(haven::as_factor) %>%
group_by(outsourcing_status, BORNUK) %>%
summarise(
Frequency = n()
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
)
readr::write_csv(bornuk_statistics, file="../outputs/data/arrival_in_UK_stats.csv")categories <- as.vector(unique(haven::as_factor(data$BORNUK)))
non_categories <- categories[!(categories %in% "I was born in the UK")]
# Will throw NA warning. I think this OK but investigate how to avoid the problem
summary_table <- data %>%
mutate(
BORNUK = haven::as_factor(BORNUK)
) %>%
mutate(
BORNUK = forcats::fct_collapse(as.character(BORNUK),
"Born in UK" = "I was born in the UK",
"Not born in UK" = non_categories)
) %>%
group_by(outsourcing_status, BORNUK) %>%
summarise(
n = n()
) %>%
mutate(
Sum = sum(n),
Percentage = 100 * (n / Sum)
)
domain <- "BORNUK"
category_1 <- "Born in UK"
category_2 <- "Not born in UK"
group_1 <- t(tibble("present"=summary_table[which(summary_table[domain]==category_1 &
summary_table["outsourcing_status"]=="Outsourced"),"n"],
"not present" = summary_table[which(summary_table[domain]==category_2 &
summary_table["outsourcing_status"]=="Outsourced"),"n"]
))
group_2 <- t(tibble("present"=summary_table[which(summary_table[domain]==category_1 &
summary_table["outsourcing_status"]=="Not outsourced"),"n"],
"not present" = summary_table[which(summary_table[domain]==category_2 &
summary_table["outsourcing_status"]=="Not outsourced"),"n"]
))
comp_mat <- as.matrix(cbind(group_2, group_1)) # matrix for crosstable
x2 <- gmodels::CrossTable(comp_mat, fisher=TRUE, chisq = TRUE)
# (chi-square = `r round(x2[["chisq"]][["statistic"]][["X-squared"]],2)`, *p* = `r ifelse(x2[["chisq"]][["p.value"]] < .001, "< .001", paste0("= ", round(``x2[["chisq"]][["p.value"]],2))`).A greater proportion of outsourced workers were not born in the UK (24.06%) compared to non-outsourced workers (13.6%).17 This difference is statistically significant; outsourced workers are 2.01 times more likely to have been born outside the UK than non-outsourced workers.
Looking at the figure below, it appears that no particular arrival time is especially common amongst the outsourced group, with a relatively equal distribution across arrival times (though potentially a slightly larger proportion fall into the ‘Within the last 10 years category’). The is broadly the case for the likely agency and high indicators groups too, though note that amongst likely agency there is a slightly larger proportion of workers who have arrived within the last year.
bornuk_statistics %>%
knitr::kable(.,digits = 2) %>%
kable_styling(full_width = F)| outsourcing_status | BORNUK | Frequency | Sum | Percentage |
|---|---|---|---|---|
| Not outsourced | I was born in the UK | 7320 | 8472 | 86.40 |
| Not outsourced | Within the last year | 115 | 8472 | 1.36 |
| Not outsourced | Within the last 3 years | 135 | 8472 | 1.59 |
| Not outsourced | Within the last 5 years | 116 | 8472 | 1.37 |
| Not outsourced | Within the last 10 years | 183 | 8472 | 2.16 |
| Not outsourced | Within the last 15 years | 137 | 8472 | 1.62 |
| Not outsourced | Within the last 20 years | 127 | 8472 | 1.50 |
| Not outsourced | Within the last 30 years | 103 | 8472 | 1.22 |
| Not outsourced | More than 30 years ago | 149 | 8472 | 1.76 |
| Not outsourced | Prefer not to say | 87 | 8472 | 1.03 |
| Outsourced | I was born in the UK | 1278 | 1683 | 75.94 |
| Outsourced | Within the last year | 65 | 1683 | 3.86 |
| Outsourced | Within the last 3 years | 45 | 1683 | 2.67 |
| Outsourced | Within the last 5 years | 40 | 1683 | 2.38 |
| Outsourced | Within the last 10 years | 71 | 1683 | 4.22 |
| Outsourced | Within the last 15 years | 53 | 1683 | 3.15 |
| Outsourced | Within the last 20 years | 44 | 1683 | 2.61 |
| Outsourced | Within the last 30 years | 14 | 1683 | 0.83 |
| Outsourced | More than 30 years ago | 39 | 1683 | 2.32 |
| Outsourced | Prefer not to say | 34 | 1683 | 2.02 |
bornuk_statistics %>%
ggplot(., aes(outsourcing_status, Percentage, fill = BORNUK)) +
geom_col(colour="black") +
annotate("text", x = bornuk_statistics$outsourcing_status, y = 75, label = paste0("n=",bornuk_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values=many_colours, name="Arrival in UK") +
theme_minimal() +
xlab("Outsourcing group") bornuk_summary_paysplit <- data %>%
group_by(outsourcing_status, income_group, BORNUK_labelled) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees)
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
)
readr::write_csv(bornuk_summary_paysplit, file = "../outputs/data/bornuk_stats_paysplit_1.csv")mod <- glm(outsourcing_status ~ income_group * BORNUK_labelled, data, weights = NatRepemployees, family="binomial")
summary(mod)
Call:
glm(formula = outsourcing_status ~ income_group * BORNUK_labelled,
family = "binomial", data = data, weights = NatRepemployees)
Coefficients:
Estimate Std. Error
(Intercept) -1.80357 0.04344
income_groupLow 0.22862 0.06439
BORNUK_labelledWithin the last year 1.28978 0.25307
BORNUK_labelledWithin the last 3 years 0.68932 0.23063
BORNUK_labelledWithin the last 5 years 0.54565 0.24814
BORNUK_labelledWithin the last 10 years 0.74566 0.18405
BORNUK_labelledWithin the last 15 years 1.05016 0.18554
BORNUK_labelledWithin the last 20 years 0.99745 0.20348
BORNUK_labelledWithin the last 30 years -1.33247 0.53396
BORNUK_labelledMore than 30 years ago 0.43879 0.23670
BORNUK_labelledPrefer not to say 0.84084 0.44469
income_groupLow:BORNUK_labelledWithin the last year -0.38239 0.35255
income_groupLow:BORNUK_labelledWithin the last 3 years -0.41813 0.39148
income_groupLow:BORNUK_labelledWithin the last 5 years 0.12118 0.42110
income_groupLow:BORNUK_labelledWithin the last 10 years 0.24893 0.28633
income_groupLow:BORNUK_labelledWithin the last 15 years -0.95025 0.38000
income_groupLow:BORNUK_labelledWithin the last 20 years -0.84991 0.43888
income_groupLow:BORNUK_labelledWithin the last 30 years 1.75447 0.69940
income_groupLow:BORNUK_labelledMore than 30 years ago 0.54466 0.35281
income_groupLow:BORNUK_labelledPrefer not to say -0.12256 0.63991
z value Pr(>|z|)
(Intercept) -41.516 < 2e-16 ***
income_groupLow 3.551 0.000384 ***
BORNUK_labelledWithin the last year 5.096 3.46e-07 ***
BORNUK_labelledWithin the last 3 years 2.989 0.002800 **
BORNUK_labelledWithin the last 5 years 2.199 0.027883 *
BORNUK_labelledWithin the last 10 years 4.051 5.09e-05 ***
BORNUK_labelledWithin the last 15 years 5.660 1.51e-08 ***
BORNUK_labelledWithin the last 20 years 4.902 9.49e-07 ***
BORNUK_labelledWithin the last 30 years -2.495 0.012579 *
BORNUK_labelledMore than 30 years ago 1.854 0.063776 .
BORNUK_labelledPrefer not to say 1.891 0.058647 .
income_groupLow:BORNUK_labelledWithin the last year -1.085 0.278074
income_groupLow:BORNUK_labelledWithin the last 3 years -1.068 0.285484
income_groupLow:BORNUK_labelledWithin the last 5 years 0.288 0.773525
income_groupLow:BORNUK_labelledWithin the last 10 years 0.869 0.384622
income_groupLow:BORNUK_labelledWithin the last 15 years -2.501 0.012396 *
income_groupLow:BORNUK_labelledWithin the last 20 years -1.937 0.052802 .
income_groupLow:BORNUK_labelledWithin the last 30 years 2.509 0.012123 *
income_groupLow:BORNUK_labelledMore than 30 years ago 1.544 0.122644
income_groupLow:BORNUK_labelledPrefer not to say -0.192 0.848114
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8150.5 on 8942 degrees of freedom
Residual deviance: 7991.6 on 8923 degrees of freedom
(1212 observations deleted due to missingness)
AIC: 8937.4
Number of Fisher Scoring iterations: 5
# To me this indicates that htere is main effect - arrival time pre
sjPlot::plot_model(mod, type = "int", legend.title = "")Results of a glm suggest that any arrival time positively predicts outsroucing status, apart from ‘within the last 15 years’ and ‘within the last 30 years’. Takeaway is that people having migrated in the past 20 years are more likely to be outsourced than people born in the uk. People having migrated in the past 15 years are less likely to be outsourced if they’re in the low income group, whilst people having migrated in the past 30 years are more likely to be outsourced if they’re in the low income group. I would take caution in interpreting these interaction results in isolation as they may be influence by other factors (e.g., ethnicity).
We should test this with a more complex model that includes covariates
The plot below shows the percentage of outrouced and non-outsourced people by income group and arrival time.
temp_data <- bornuk_summary_paysplit %>%
drop_na(income_group)
for(group in unique(temp_data$income_group)){
plot_data <- temp_data %>%
filter(income_group==group)
plot <- plot_data %>%
ggplot(., aes(BORNUK_labelled, Percentage, fill = outsourcing_status)) +
geom_col(colour="black", position = position_dodge()) +
#annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal() +
theme(
legend.position = "bottom"
) +
ggtitle(paste0(group, " income"))
print(plot)
}# ethnicity_summary_paysplit %>%
# drop_na(income_group) %>%
# ggplot(., aes(outsourcing_status, Percentage, fill = Ethnicity_short)) +
# facet_grid(rows=~income_group) +
# geom_col(colour="black") +
# #annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
# coord_flip() +
# scale_fill_manual(values = many_colours, name = "Ethnicity") +
# xlab("Outsourcing group") +
# theme_minimal() +
# theme(
# legend.position = "bottom"
# )temp_df <- data %>%
# filter(outsourcing_status=="Outsourced") %>%
mutate(
Ethnicity = haven::as_factor(Ethnicity),
BORNUK = haven::as_factor(BORNUK)
)
# mytable <- table(temp_df$BORNUK,temp_df$Ethnicity)
# tab <- as.data.frame(prop.table(mytable))
# int_summary <- temp_df %>%
# group_by(BORNUK, Ethnicity) %>%
# summarise(
# Frequency = sum(NatRepemployees)
# ) %>%
# mutate(
# Percentage = 100 * (Frequency/sum(Frequency))
# )
#
# int_summary %>%
# ggplot(., aes(BORNUK, Percentage, fill = Ethnicity)) +
# geom_col() +
# coord_flip() +
# theme(legend.position = "none") +
# scale_fill_manual(values = many_colours)
#
# int_summary %>%
# ggplot(., aes(BORNUK, Percentage, fill = Ethnicity)) +
# geom_col() +
# coord_flip() +
# scale_fill_manual(values = many_colours)
# # theme(legend.position = "none")
int_summary_2 <- temp_df %>%
group_by(outsourcing_status, Ethnicity, BORNUK) %>%
summarise(
Frequency = sum(NatRepemployees)
) %>%
mutate(
Percentage = 100 * (Frequency/sum(Frequency))
)
int_summary_2 %>%
ggplot(., aes(Ethnicity, Percentage, fill = BORNUK)) +
facet_grid(rows=vars(outsourcing_status)) +
geom_col() +
coord_flip() +
theme(legend.position = "none") +
scale_fill_manual(values = many_colours)int_summary_2 %>%
ggplot(., aes(Ethnicity, Percentage, fill = BORNUK)) +
facet_grid(rows=vars(outsourcing_status)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = many_colours) # theme(legend.position = "none")
readr::write_csv(int_summary_2, file="../outputs/data/interaction_ethnicity_arrival_in_UK_stats.csv")For the following ethnicites, more than 50% were born in the UK:
For the following ethnicities around 50% were born in the UK, with remaining 50% split across arrival times:
For the following ethnicities, less than 50% were born in the UK:
# filter(outsourcing_status=="Outsourced") %>%
# mutate(
# Ethnicity = haven::as_factor(Ethnicity),
# BORNUK = haven::as_factor(BORNUK)
# )
# mytable <- table(temp_df$BORNUK,temp_df$Ethnicity)
# tab <- as.data.frame(prop.table(mytable))
# int_summary <- temp_df %>%
# group_by(BORNUK, Ethnicity) %>%
# summarise(
# Frequency = sum(NatRepemployees)
# ) %>%
# mutate(
# Percentage = 100 * (Frequency/sum(Frequency))
# )
#
# int_summary %>%
# ggplot(., aes(BORNUK, Percentage, fill = Ethnicity)) +
# geom_col() +
# coord_flip() +
# theme(legend.position = "none") +
# scale_fill_manual(values = many_colours)
#
# int_summary %>%
# ggplot(., aes(BORNUK, Percentage, fill = Ethnicity)) +
# geom_col() +
# coord_flip() +
# scale_fill_manual(values = many_colours)
# # theme(legend.position = "none")
int_summary_2 <- data %>%
group_by(outsourcing_status, Ethnicity_collapsed, BORNUK_labelled) %>%
summarise(
Frequency = sum(NatRepemployees)
) %>%
mutate(
Percentage = 100 * (Frequency/sum(Frequency))
)
# int_summary_2 %>%
# ggplot(., aes(Ethnicity_collapsed, Percentage, fill = BORNUK_labelled)) +
# facet_grid(rows=vars(outsourcing_status)) +
# geom_col() +
# coord_flip() +
# theme(legend.position = "none") +
# scale_fill_manual(values = many_colours)
int_summary_2 %>%
ggplot(., aes(Ethnicity_collapsed, Percentage, fill = BORNUK_labelled)) +
facet_grid(rows=vars(outsourcing_status)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = many_colours) # theme(legend.position = "none")data <- data %>%
mutate(
BORNUK_collapsed = forcats::fct_collapse(BORNUK_labelled,
"Born in UK" = "I was born in the UK",
"Came to UK recently" = c("Within the last year",
"Within the last 3 years",
"Within the last 5 years",
"Within the last 10 years"),
"Came to UK not recently" = c("Within the last 15 years",
"Within the last 20 years",
"Within the last 30 years",
"More than 30 years ago"),
"Prefer not to say" = c("Prefer not to say")
)
)
int_summary_3 <- data %>%
group_by(outsourcing_status, Ethnicity_collapsed, BORNUK_collapsed) %>%
summarise(
Frequency = sum(NatRepemployees)
) %>%
mutate(
Percentage = 100 * (Frequency/sum(Frequency))
)
int_summary_3 %>%
ggplot(., aes(Ethnicity_collapsed, Percentage, fill = BORNUK_collapsed)) +
facet_grid(rows=vars(outsourcing_status)) +
geom_col() +
coord_flip() +
scale_fill_manual(values = many_colours)# mod <- glm(outsourcing_status ~ Ethnicity_collapsed*BORNUK_collapsed, data, family="binomial", weight = NatRepemployees)
# summary(mod)
# emmeans(mod, specs = "Ethnicity_collapsed", by ="BORNUK_collapsed")
# sjPlot::plot_model(mod, type = "int", legend.title = "", terms = c("outsourcing_status","BORNUK_collapsed","Ethnicity_collapsed ['Black African]"))
# gender_statistics <- data %>%
# get values of labels
mutate_all(haven::as_factor) %>%
group_by(outsourcing_status, Gender) %>%
summarise(
Frequency = n()
) %>%
mutate(
Percentage = 100 * (Frequency / sum(Frequency))
)
readr::write_csv(gender_statistics, file="../outputs/data/gender_statistics.csv")gender_statistics %>%
ggplot(., aes(outsourcing_status, Percentage, fill = Gender)) +
geom_col(colour="black") +
# annotate("text", x = gender_statistics$outsourcing_status, y = 75, label = paste0("n=", gender_statistics$Frequency)) +
coord_flip() +
scale_fill_manual(values=colours) +
theme_minimal() +
xlab("Outsourcing group") categories <- as.vector(unique(haven::as_factor(data$Gender)))
non_categories <- categories[!(categories %in% "Male")]
# Will throw NA warning. I think this OK but investigate how to avoid the problem
summary_table <- data %>%
mutate(
Gender = haven::as_factor(Gender)
) %>%
mutate(
Gender = forcats::fct_collapse(as.character(Gender),
"Male" = "Male",
"Not male" = non_categories)
) %>%
group_by(outsourcing_status, Gender) %>%
summarise(
n = n()
) %>%
mutate(
Sum = sum(n),
Percentage = 100 * (n / Sum)
)
domain <- "Gender"
category_1 <- "Male"
category_2 <- "Not male"
group_1 <- t(tibble("present"=summary_table[which(summary_table[domain]==category_1 &
summary_table["outsourcing_status"]=="Not outsourced"),"n"],
"not present" = summary_table[which(summary_table[domain]==category_2 &
summary_table["outsourcing_status"]=="Not outsourced"),"n"]
))
group_2 <- t(tibble("present"=summary_table[which(summary_table[domain]==category_1 &
summary_table["outsourcing_status"]=="Outsourced"),"n"],
"not present" = summary_table[which(summary_table[domain]==category_2 &
summary_table["outsourcing_status"]=="Outsourced"),"n"]
))
comp_mat <- as.matrix(cbind(group_2, group_1)) # matrix for crosstable
x2 <- gmodels::CrossTable(comp_mat, fisher=TRUE, chisq = TRUE)In terms of Gender, the outsourced group has a larger proportion of males (57.81% compared to 46.4%). This difference is statistically significant; outsourced workers are 1.58 times more likely to have be male than non-outsourced workers.
gender_summary_paysplit <- data %>%
group_by(outsourcing_status, income_group, Gender) %>%
summarise(
n = n(),
Frequency = sum(NatRepemployees)
) %>%
mutate(
N = sum(n),
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
)
readr::write_csv(ethnicity_statistics, file = "../outputs/data/gender_stats_paysplit_1.csv")mod <- glm(outsourcing_status ~ Gender * income_group, data, family="binomial", weights = NatRepemployees)
summary(mod)
Call:
glm(formula = outsourcing_status ~ Gender * income_group, family = "binomial",
data = data, weights = NatRepemployees)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.01977 0.06791 -29.741 < 2e-16
GenderMale 0.56260 0.08168 6.888 5.67e-12
GenderOther 0.44378 0.96305 0.461 0.6449
GenderPrefer not to say 0.65109 0.69214 0.941 0.3469
income_groupLow 0.43123 0.08751 4.928 8.33e-07
GenderMale:income_groupLow -0.24070 0.11969 -2.011 0.0443
GenderOther:income_groupLow -0.10606 1.38352 -0.077 0.9389
GenderPrefer not to say:income_groupLow -0.65321 1.10050 -0.594 0.5528
(Intercept) ***
GenderMale ***
GenderOther
GenderPrefer not to say
income_groupLow ***
GenderMale:income_groupLow *
GenderOther:income_groupLow
GenderPrefer not to say:income_groupLow
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 8150.5 on 8942 degrees of freedom
Residual deviance: 8076.3 on 8935 degrees of freedom
(1212 observations deleted due to missingness)
AIC: 9001.4
Number of Fisher Scoring iterations: 4
sjPlot::plot_model(mod, type = "int")A glm finds that income group is a significant factor for females but not for males. Whilst males are more likely than females overall to be outsourced, females are significantly more likely to be outsourced if they are in the low income group than if they are in the not-low income group. The plots below show the percentage of outsourced workers by income group and gender.
temp_data <- gender_summary_paysplit %>%
drop_na(income_group)
for(group in unique(temp_data$income_group)){
plot_data <- temp_data %>%
filter(income_group==group)
plot <- plot_data %>%
ggplot(., aes(Gender, Percentage, fill = outsourcing_status)) +
geom_col(colour="black", position = position_dodge()) +
#annotate("text", x = ethnicity_statistics$outsourcing_status, y = 75, label = paste0("n=",ethnicity_statistics$Sum)) +
coord_flip() +
scale_fill_manual(values = many_colours, name = "Ethnicity") +
xlab("Outsourcing group") +
theme_minimal() +
theme(
legend.position = "bottom"
) +
ggtitle(paste0(group, " income"))
print(plot)
}Let’s cross check the size of the employed workforce across regions, and compare this to how many people are in each region in our sample. The percentages should work out the same if they’re weighted.
The tables below show that our sample is weighted by region. The weighted percentage of our sampled workers in each region matches the percentages from the ONS employment by region tables. This means that the weighted percentage of workers (and therefore outsourced workers) in our sample can be considered to be representative of the national picture.
our_sample <- data %>%
group_by(Region) %>%
summarise(
n = n(),
sample_sum = sum(NatRepemployees)
) %>%
mutate(
perc = 100 * (n / sum(n)),
sample_wtd_perc = 100 * (sample_sum/sum(sample_sum))
) %>%
arrange(desc(sample_wtd_perc))
denoms <- rgn_empl_denoms %>%
mutate(
employment_data_perc = 100 * (Employed / sum(Employed))
) %>%
select(-Weight) %>%
arrange(desc(employment_data_perc))
combined <- merge(denoms,our_sample, by="Region") %>%
select(c(Region, Employed, sample_sum, employment_data_perc, sample_wtd_perc)) %>%
arrange(desc(sample_wtd_perc))
# our_sample %>%
# kable() %>%
# kable_styling()
#
# denoms %>%
# kable() %>%
# kable_styling()
combined %>%
kable() %>%
kable_styling()| Region | Employed | sample_sum | employment_data_perc | sample_wtd_perc |
|---|---|---|---|---|
| London | 4829298851 | 1431.0519 | 14.557314 | 14.092092 |
| South East | 4753437758 | 1420.8114 | 14.328640 | 13.991250 |
| North West | 3583507881 | 1095.8237 | 10.802034 | 10.790977 |
| West Midlands | 2882955040 | 879.5148 | 8.690305 | 8.660903 |
| Scotland | 2650815533 | 853.0980 | 7.990550 | 8.400768 |
| South West | 2802120873 | 837.8587 | 8.446641 | 8.250701 |
| East Midlands | 2359055655 | 728.1845 | 7.111077 | 7.170699 |
| Wales | 1436559071 | 461.0781 | 4.330327 | 4.540405 |
| North East | 1215820684 | 386.9397 | 3.664938 | 3.810337 |
| Northern Ireland | 873773187 | 275.2233 | 2.633879 | 2.710224 |
The plot below shows the distribution of outsourced and non outsourced workers across regions. It suggests that an outsourced worker is more likely to be based in London than a non-outsourced worker.
region_statistics <- data %>%
# get values of labels
# mutate_all(haven::as_factor) %>%
group_by(outsourcing_status, Region) %>%
summarise(
Frequency = sum(NatRepemployees)
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
) %>%
rename(
`Outsourcing status` = outsourcing_status
)
region_statistics %>%
mutate(
Region = haven::as_factor(Region)
) %>%
knitr::kable(.,digits = 2) %>%
kable_styling(full_width = F)| Outsourcing status | Region | Frequency | Sum | Percentage |
|---|---|---|---|---|
| Not outsourced | East Midlands | 587.69 | 8446.64 | 6.96 |
| Not outsourced | East of England | 841.35 | 8446.64 | 9.96 |
| Not outsourced | London | 1073.70 | 8446.64 | 12.71 |
| Not outsourced | North East | 333.88 | 8446.64 | 3.95 |
| Not outsourced | North West | 906.43 | 8446.64 | 10.73 |
| Not outsourced | Northern Ireland | 231.66 | 8446.64 | 2.74 |
| Not outsourced | Scotland | 727.28 | 8446.64 | 8.61 |
| Not outsourced | South East | 1232.34 | 8446.64 | 14.59 |
| Not outsourced | South West | 717.36 | 8446.64 | 8.49 |
| Not outsourced | Wales | 377.83 | 8446.64 | 4.47 |
| Not outsourced | West Midlands | 718.03 | 8446.64 | 8.50 |
| Not outsourced | Yorkshire and the Humber | 699.11 | 8446.64 | 8.28 |
| Outsourced | East Midlands | 140.50 | 1708.36 | 8.22 |
| Outsourced | East of England | 125.49 | 1708.36 | 7.35 |
| Outsourced | London | 357.35 | 1708.36 | 20.92 |
| Outsourced | North East | 53.06 | 1708.36 | 3.11 |
| Outsourced | North West | 189.39 | 1708.36 | 11.09 |
| Outsourced | Northern Ireland | 43.56 | 1708.36 | 2.55 |
| Outsourced | Scotland | 125.82 | 1708.36 | 7.37 |
| Outsourced | South East | 188.47 | 1708.36 | 11.03 |
| Outsourced | South West | 120.50 | 1708.36 | 7.05 |
| Outsourced | Wales | 83.25 | 1708.36 | 4.87 |
| Outsourced | West Midlands | 161.49 | 1708.36 | 9.45 |
| Outsourced | Yorkshire and the Humber | 119.46 | 1708.36 | 6.99 |
region_statistics %>%
mutate(
Region = haven::as_factor(Region)
) %>%
ggplot(., aes(`Outsourcing status`, Percentage, fill = Region)) +
geom_col(colour="black") +
coord_flip() +
scale_fill_manual(values=many_colours) +
theme_minimal()readr::write_csv(region_statistics, file="../outputs/data/region_statistics.csv")In the plot below the percentages have been scaled to the size of the working population in the region as a function of the total working population in the UK. I need to check whether this scaling is actually necessary, given we are already using weighted data.21 Does the weighting process account for region?
Below we calculate the number of outsourced workers within each region.
region_statistics_2 <- data %>%
# get values of labels
# mutate_all(haven::as_factor) %>%
group_by(Region, outsourcing_status) %>%
summarise(
Frequency = sum(NatRepemployees)
) %>%
mutate(
Sum = sum(Frequency),
Percentage = 100 * (Frequency / Sum)
) %>%
rename(
`Outsourcing status` = outsourcing_status
)
region_statistics_2 %>%
ggplot(., aes(Region, Percentage, fill = `Outsourcing status`)) +
geom_col(colour="black") +
coord_flip() +
scale_fill_manual(values=many_colours) +
theme_minimal()readr::write_csv(region_statistics_2, file = "../outputs/data/region_stats_2.csv")
region_statistics_2_1 <- region_statistics_2 %>%
filter(`Outsourcing status` == "Outsourced" & Region != "London")
london_perc <- region_statistics_2[which(region_statistics_2$Region == "London" & region_statistics_2["Outsourcing status"] == "Outsourced"), "Percentage"]Visualised on a map:
knitr::include_graphics('../outputs/figures/outsourcing_by_region.svg')Visualised on a map, excluding London to enable us to differentiate better between other regions:
knitr::include_graphics('../outputs/figures/outsourcing_by_region_excl_london.svg')As we can see, London has the highest proportion of outsourced workers (25%). After London, the regions with the highest proportion of outsourced workers are:
age_statistics <- data %>%
group_by(outsourcing_status) %>%
summarise(
mean = weighted.mean(Age, w = NatRepemployees, na.rm = T),
median = wtd.quantile(Age, w = NatRepemployees, probs = c(.5), na.rm = T),
min = wtd.quantile(Age, w = NatRepemployees, probs = c(0), na.rm = T),
max = wtd.quantile(Age, w = NatRepemployees, probs = c(1), na.rm = T),
stdev = sqrt(wtd.var(Age, w = NatRepemployees, na.rm = T))
)
readr::write_csv(age_statistics, file = "../outputs/data/age_stats.csv")As shown in the table below, the median age of the outsourced group is 36 , compared to 43 for the not outsourced group.22
However, as the next figure shows, the age distribution is different for the outsourced and high indicator groups compared to the not outsourced and likely agency groups; the outsourced and high indicator groups have higher proportions of younger people (~21-36 year olds).
A t-test indicates that on average, outsourced workers are significantly younger than non-outsourced workers (t(2399.2) = 11.95, p = 0).
knitr::kable(age_statistics,
digits = 2,
col.names = c("Outsourcing group",
"Mean",
"Median",
"Min",
"Max",
"Standard dev.")) %>%
kable_styling(full_width = F)| Outsourcing group | Mean | Median | Min | Max | Standard dev. |
|---|---|---|---|---|---|
| Not outsourced | 42.80 | 43 | 16 | 80 | 13.08 |
| Outsourced | 38.63 | 36 | 16 | 78 | 13.07 |
data %>%
mutate(
Age = as.numeric(as.character(as_factor(Age)))
) %>%
ggplot(.,aes(Age, colour = outsourcing_status, fill = outsourcing_status)) +
geom_density(alpha = 0.3) +
geom_vline(data =age_statistics, aes(xintercept=median, colour = outsourcing_status)) +
scale_x_continuous(breaks = seq(min(age_statistics$min), max(age_statistics$max),5)) +
theme_minimal() +
scale_colour_manual(values=colours, name = "Outsourcing status") +
scale_fill_manual(values=colours, name = "Outsourcing status")age_statistics_2 <- data %>%
group_by(outsourcing_group) %>%
summarise(
mean = weighted.mean(Age, w = NatRepemployees, na.rm = T),
median = wtd.quantile(Age, w = NatRepemployees, probs = c(.5), na.rm = T),
min = wtd.quantile(Age, w = NatRepemployees, probs = c(0), na.rm = T),
max = wtd.quantile(Age, w = NatRepemployees, probs = c(1), na.rm = T),
stdev = sqrt(wtd.var(Age, w = NatRepemployees, na.rm = T))
)
readr::write_csv(age_statistics_2, file = "../outputs/data/age_stats_2.csv")Exploring the age distribution for the different outsourced groups, the high density concentration of slightly younger workers identified above appears to be driven primarily by the ‘outsourced’ and ‘high indicator’ groups. The ‘likely agency’ group follows a similar pattern, but has a lower density peak than the other groups, with a higher density of workers of more advanced ages.
knitr::kable(age_statistics_2,
digits = 2,
col.names = c("Outsourcing group",
"Mean",
"Median",
"Min",
"Max",
"Standard dev.")) %>%
kable_styling(full_width = F)| Outsourcing group | Mean | Median | Min | Max | Standard dev. |
|---|---|---|---|---|---|
| Not outsourced | 42.80 | 43 | 16 | 80 | 13.08 |
| Outsourced | 38.40 | 35 | 16 | 78 | 13.09 |
| Likely agency | 39.80 | 38 | 18 | 77 | 13.49 |
| High indicators | 38.49 | 35 | 18 | 72 | 12.55 |
data %>%
ggplot(.,aes(Age, colour = outsourcing_group, fill = outsourcing_group)) +
geom_density(alpha = 0.2) +
geom_vline(data = age_statistics_2, aes(xintercept=median, colour = outsourcing_group)) +
scale_x_continuous(breaks = seq(min(age_statistics_2$min), max(age_statistics_2$max),5)) +
theme_minimal() +
scale_colour_manual(values=better_colours, name = "Outsourcing group") +
scale_fill_manual(values=better_colours, name = "Outsourcing group")Data file: “outputs/data/total_outsourced.csv”↩︎
Data file: outputs/data/income_stats.csv↩︎
Datafile: “../outputs/data/majorgroupcode_summary.csv”↩︎
Data file: “../outputs/data/majorgroupcode_summary_long.csv” & “../outputs/data/majorgroupcode_summary_wide.csv”)↩︎
“../outputs/data/elementary_occs_summary.csv”↩︎
“../outputs/data/process_occs_summary.csv”↩︎
“../outputs/data/caring_occs_summary.csv”↩︎
“outputs/data/sector_summary.csv”↩︎
“outputs/data/sector_summary_2.csv”↩︎
“outputs/data/sector_summary_3.csv”↩︎
“outputs/sector_summary_paysplit.csv”↩︎
Data file: outputs/data/ethnicity_stats_1.csv↩︎
“outputs/data/ethnicity_stats_paysplit_1.csv”↩︎
Data file: outputs/data/ethnicity_stats_2.csv↩︎
“outputs/data/ethnicity_stats_paysplit_1.csv”↩︎
Data file: outputs/data/ethnicity_stats_2.csv↩︎
Data file: outputs/data/arrival_in_UK_stats.csv↩︎
../outputs/data/interaction_ethnicity_arrival_in_UK_stats.csv↩︎
Data file: outputs/data/gender_statistics.csv↩︎
Data file: outputs/data/region_statistics.csv↩︎
../outputs/data/region_statistics_weighted.csv↩︎
Data file: outputs/data/age_stats.csv↩︎
“outputs/data/age_stats_2.csv”↩︎